{"passport":{"unfragile":{"@version":"1.0","version":"2026-05","artifact":{"id":"hf-dataset-mlfoundations--mint-1t-pdf-cc-2024-18","slug":"mlfoundations--mint-1t-pdf-cc-2024-18","name":"MINT-1T-PDF-CC-2024-18","type":"dataset","url":"https://huggingface.co/datasets/mlfoundations/MINT-1T-PDF-CC-2024-18","page_url":"https://unfragile.ai/mlfoundations--mint-1t-pdf-cc-2024-18","categories":["model-training"],"tags":["task_categories:image-to-text","task_categories:text-generation","language:en","license:cc-by-4.0","size_categories:100B<n<1T","arxiv:2406.11271","region:us","multimodal"],"pricing":{"model":"open_source","free":true,"starting_price":null},"status":"active","verified":false},"capabilities":[{"id":"hf-dataset-mlfoundations--mint-1t-pdf-cc-2024-18__cap_0","uri":"capability://data.processing.analysis.large.scale.multimodal.document.image.dataset.curation.and.indexing","name":"large-scale multimodal document-image dataset curation and indexing","description":"Provides a 1 trillion token-scale dataset of PDF documents paired with extracted images and text, curated from Common Crawl with deduplication and quality filtering applied at scale. The dataset uses HuggingFace's distributed dataset infrastructure to enable efficient streaming and sampling of 1M+ document-image pairs without requiring full local storage, with metadata indexing for retrieval by document type, language, and content characteristics.","intents":["Train vision-language models on real-world document understanding tasks at scale","Build datasets for PDF-to-text and image-to-text extraction models","Evaluate multimodal model performance on document comprehension benchmarks","Create domain-specific training corpora by filtering and sampling from the full dataset"],"best_for":["ML researchers training large vision-language models (LLaVA, GPT-4V competitors)","Teams building document processing pipelines requiring diverse training data","Organizations developing OCR and document understanding systems"],"limitations":["1T tokens requires significant computational resources for full training — most practitioners sample subsets","PDF extraction quality varies by document structure; scanned/image-heavy PDFs may have degraded text extraction","Dataset is English-dominant; limited multilingual coverage despite CC-BY-4.0 license allowing derivative works","No built-in document type stratification — requires custom filtering to balance document categories"],"requires":["HuggingFace Datasets library (>=2.14.0) for streaming access","Minimum 50GB free disk space for partial caching; 10TB+ for full local mirror","Python 3.8+ with PyTorch or TensorFlow for model training integration","Network bandwidth for streaming from HuggingFace Hub (~100-500 Mbps recommended)"],"input_types":["PDF documents (raw binary)","Common Crawl WARC records (source format)","Metadata queries (document type, language, content hash)"],"output_types":["Structured dataset records (image tensors, text strings, metadata JSON)","Arrow/Parquet format for efficient columnar storage","Streaming batches for PyTorch DataLoader or TensorFlow tf.data pipelines"],"categories":["data-processing-analysis","memory-knowledge"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-dataset-mlfoundations--mint-1t-pdf-cc-2024-18__cap_1","uri":"capability://data.processing.analysis.streaming.dataset.access.with.lazy.loading.and.memory.efficient.batching","name":"streaming dataset access with lazy loading and memory-efficient batching","description":"Implements HuggingFace Datasets' streaming protocol to load document-image pairs on-demand without downloading the full 1T token dataset, using memory-mapped Arrow format and distributed sharding across multiple processes. Batching is handled through configurable DataLoader wrappers that respect image tensor dimensions and text sequence lengths, enabling training on machines with limited VRAM through dynamic batch size adjustment.","intents":["Train models on large datasets without requiring multi-terabyte local storage","Parallelize data loading across multiple GPUs/TPUs with automatic shard distribution","Prototype and iterate on model architectures without waiting for full dataset downloads","Monitor data quality and distribution during training with streaming statistics"],"best_for":["Researchers with GPU clusters but limited NVMe storage","Teams using cloud training (AWS SageMaker, GCP Vertex AI) with per-instance bandwidth constraints","Iterative model development requiring rapid experimentation cycles"],"limitations":["Streaming introduces ~50-200ms latency per batch due to network I/O and decompression — not suitable for real-time inference","Deterministic shuffling requires maintaining epoch-level state; distributed training needs careful synchronization to avoid duplicate batches","Image tensor shapes vary (PDFs have different page dimensions); requires padding or resizing, adding preprocessing overhead","No built-in caching strategy — repeated epochs re-download identical data unless external cache layer is added"],"requires":["HuggingFace Datasets >=2.14.0 with streaming support","PyTorch DataLoader or TensorFlow tf.data for batching integration","Network connectivity to HuggingFace Hub (CDN-cached, but requires ~100 Mbps sustained)","Python 3.8+ with numpy and PIL/Pillow for image preprocessing"],"input_types":["Dataset configuration (split name, streaming=True flag)","Batch size and number of workers","Image preprocessing parameters (resize, normalize)"],"output_types":["PyTorch tensors (images: [B, C, H, W], text: [B, seq_len])","TensorFlow tf.data.Dataset objects","Raw dictionaries with 'image', 'text', 'metadata' keys"],"categories":["data-processing-analysis","automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-dataset-mlfoundations--mint-1t-pdf-cc-2024-18__cap_2","uri":"capability://data.processing.analysis.document.image.pair.extraction.and.alignment.from.pdf.sources","name":"document-image pair extraction and alignment from pdf sources","description":"Extracts text and images from PDF documents using OCR and layout analysis, then aligns extracted text with corresponding page images through spatial coordinate matching and text-region association. The extraction pipeline handles multi-page PDFs, preserves document structure metadata (headers, footers, sections), and deduplicates near-identical documents using perceptual hashing and text similarity metrics to ensure dataset quality.","intents":["Create training pairs for vision-language models that understand document layout and content","Build datasets for document classification, information extraction, and table understanding tasks","Evaluate OCR and document understanding models on real-world PDF diversity","Preserve document structure information for downstream layout-aware model training"],"best_for":["Teams building document understanding systems (invoice processing, form extraction, contract analysis)","Researchers developing layout-aware vision-language models","Organizations needing high-quality document-image pairs with minimal manual annotation"],"limitations":["OCR quality degrades on scanned documents with poor image quality, handwriting, or non-Latin scripts — affects ~15-20% of Common Crawl PDFs","Text-image alignment assumes regular document layouts; complex multi-column layouts or overlapping text regions may have misalignment errors","Deduplication uses heuristic similarity thresholds; near-duplicate documents with minor variations may not be fully deduplicated","Metadata preservation is lossy — complex PDF annotations, form fields, and embedded media are not fully captured"],"requires":["PDF parsing library (PyPDF2, pdfplumber, or similar) for text extraction","OCR engine (Tesseract, EasyOCR, or cloud-based) for image-based text recognition","Image processing library (Pillow, OpenCV) for page rendering and coordinate transformation","Hashing library (imagehash, hashlib) for deduplication"],"input_types":["PDF files (binary format, any size up to 100MB+)","Page images (rendered from PDFs at 150-300 DPI)","Document metadata (URL, crawl date, source domain)"],"output_types":["Structured records: {image: PIL.Image, text: str, metadata: {page_num, source_url, ...}}","Alignment metadata: {text_regions: [{bbox, text, confidence}], ...}","Deduplication hashes: {perceptual_hash, text_hash, similarity_score}"],"categories":["data-processing-analysis","image-visual"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-dataset-mlfoundations--mint-1t-pdf-cc-2024-18__cap_3","uri":"capability://data.processing.analysis.common.crawl.sourced.dataset.with.quality.filtering.and.language.detection","name":"common crawl-sourced dataset with quality filtering and language detection","description":"Ingests documents from Common Crawl's WARC archives, applies language detection (likely using fastText or similar) to filter for English content, and runs quality heuristics (text-to-image ratio, document length, spam detection) to remove low-quality or malicious PDFs. The filtering pipeline is applied during dataset construction, reducing the raw crawl from billions of documents to 1M+ high-quality document-image pairs with reproducible filtering criteria.","intents":["Access large-scale, diverse real-world document data without manual curation","Train models on naturally-occurring document distributions rather than synthetic or curated datasets","Understand document diversity and quality characteristics across the public web","Build reproducible datasets with transparent filtering criteria for research transparency"],"best_for":["Researchers requiring large-scale, diverse training data with public provenance","Teams building production document systems that need to handle real-world document variety","Organizations prioritizing dataset transparency and reproducibility"],"limitations":["Common Crawl has inherent biases toward English-language, Western-hosted content — non-English and non-Latin script documents are underrepresented","Quality filtering is heuristic-based and may remove valid documents (e.g., minimalist designs with low text-to-image ratio) or retain low-quality ones","Copyright and licensing compliance is user responsibility — CC-BY-4.0 license covers the dataset metadata, but source PDFs may have different licenses","Dataset is static (2024-18 snapshot) — does not reflect real-time web changes or new document types"],"requires":["Understanding of Common Crawl WARC format and S3 access (if processing raw crawl data)","Language detection library (fastText, langdetect) for filtering","Spam/quality detection heuristics (text entropy, domain reputation, etc.)","Reproducible random seed for deterministic filtering"],"input_types":["Common Crawl WARC records (raw crawl data)","PDF URLs and metadata from crawl index","Quality filtering parameters (min text length, max spam score, etc.)"],"output_types":["Filtered dataset records with quality scores and filtering rationale","Metadata: {source_url, crawl_date, language, quality_score, filtering_reason}","Statistics: {total_documents, filtered_count, quality_distribution}"],"categories":["data-processing-analysis","search-retrieval"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-dataset-mlfoundations--mint-1t-pdf-cc-2024-18__cap_4","uri":"capability://data.processing.analysis.multimodal.dataset.sampling.and.stratification.for.balanced.model.training","name":"multimodal dataset sampling and stratification for balanced model training","description":"Provides mechanisms to sample subsets of the 1T token dataset with control over document type distribution, image-text ratio, and content characteristics. Sampling can be stratified by document category (academic papers, web pages, forms, etc.) or by content properties (text length, image density, language) to ensure training data reflects desired distributions rather than raw web frequencies, which are heavily skewed toward common document types.","intents":["Create balanced training sets that represent diverse document types equally","Evaluate model performance on specific document categories without training on full dataset","Mitigate dataset bias toward common document types (web pages) by oversampling rare types (forms, tables)","Experiment with different data distributions to understand their impact on model performance"],"best_for":["Researchers studying how document diversity affects vision-language model performance","Teams building domain-specific document systems (e.g., financial document processing) requiring balanced training data","Practitioners with limited compute wanting to train on representative subsets"],"limitations":["Stratification requires pre-computed metadata (document type, content properties) — metadata quality affects sampling quality","Sampling without replacement can exhaust rare categories if subset size is large relative to category frequency","No built-in mechanism to enforce stratification across distributed training — requires custom sampler implementation","Stratified sampling may not reflect real-world document distributions, potentially hurting generalization"],"requires":["HuggingFace Datasets with custom sampler implementation or PyTorch DistributedSampler","Pre-computed stratification metadata (document type, content properties)","Random seed for reproducible sampling"],"input_types":["Sampling parameters: {subset_size, stratification_key, distribution_target}","Stratification metadata: {document_type, text_length, image_count, ...}","Random seed for reproducibility"],"output_types":["Sampled dataset with specified distribution","Sampling statistics: {actual_distribution, target_distribution, sampling_ratio_per_stratum}","Sample indices for reproducibility"],"categories":["data-processing-analysis","automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-dataset-mlfoundations--mint-1t-pdf-cc-2024-18__cap_5","uri":"capability://data.processing.analysis.metadata.rich.document.records.with.source.attribution.and.quality.scores","name":"metadata-rich document records with source attribution and quality scores","description":"Each dataset record includes rich metadata beyond image and text: source URL, crawl date, document type classification, quality score, OCR confidence, text-image alignment score, and deduplication information. Metadata is structured as JSON and queryable, enabling filtering and analysis without loading full images/text, and providing traceability for reproducibility and copyright attribution.","intents":["Filter dataset by quality, document type, or source characteristics without loading full records","Trace document provenance for copyright compliance and citation","Analyze dataset composition and quality distribution","Reproduce dataset construction by querying filtering criteria"],"best_for":["Researchers requiring dataset transparency and reproducibility","Teams managing copyright and licensing compliance","Practitioners analyzing dataset bias and quality characteristics"],"limitations":["Metadata is only as good as extraction quality — OCR confidence scores may be inaccurate for complex layouts","Document type classification is automated and may have errors — manual verification required for critical applications","Metadata schema may not capture all relevant properties — users may need custom metadata extraction","Metadata queries require loading metadata for all records, which can be slow for billion-scale datasets"],"requires":["HuggingFace Datasets with metadata column support","JSON parsing for metadata extraction","Query language or filtering library (pandas, DuckDB) for metadata analysis"],"input_types":["Metadata query filters: {document_type, min_quality_score, source_domain, ...}","Metadata fields to retrieve"],"output_types":["Filtered dataset records with selected metadata","Metadata statistics: {quality_distribution, document_type_counts, source_domain_distribution}","Metadata-only dataset for analysis without loading images/text"],"categories":["data-processing-analysis","memory-knowledge"],"confidence":0.5,"matches":0,"success_rate":0}],"trust":{"score":23,"verified":false,"data_access_risk":"low","permissions":["HuggingFace Datasets library (>=2.14.0) for streaming access","Minimum 50GB free disk space for partial caching; 10TB+ for full local mirror","Python 3.8+ with PyTorch or TensorFlow for model training integration","Network bandwidth for streaming from HuggingFace Hub (~100-500 Mbps recommended)","HuggingFace Datasets >=2.14.0 with streaming support","PyTorch DataLoader or TensorFlow tf.data for batching integration","Network connectivity to HuggingFace Hub (CDN-cached, but requires ~100 Mbps sustained)","Python 3.8+ with numpy and PIL/Pillow for image preprocessing","PDF parsing library (PyPDF2, pdfplumber, or similar) for text extraction","OCR engine (Tesseract, EasyOCR, or cloud-based) for image-based text recognition"],"failure_modes":["1T tokens requires significant computational resources for full training — most practitioners sample subsets","PDF extraction quality varies by document structure; scanned/image-heavy PDFs may have degraded text extraction","Dataset is English-dominant; limited multilingual coverage despite CC-BY-4.0 license allowing derivative works","No built-in document type stratification — requires custom filtering to balance document categories","Streaming introduces ~50-200ms latency per batch due to network I/O and decompression — not suitable for real-time inference","Deterministic shuffling requires maintaining epoch-level state; distributed training needs careful synchronization to avoid duplicate batches","Image tensor shapes vary (PDFs have different page dimensions); requires padding or resizing, adding preprocessing overhead","No built-in caching strategy — repeated epochs re-download identical data unless external cache layer is added","OCR quality degrades on scanned documents with poor image quality, handwriting, or non-Latin scripts — affects ~15-20% of Common Crawl PDFs","Text-image alignment assumes regular document layouts; complex multi-column layouts or overlapping text regions may have misalignment errors","builder identity is not verified yet","no observed match outcomes yet"],"rank_breakdown":{"adoption":0.05,"quality":0.22,"ecosystem":0.5000000000000001,"match_graph":0.25,"freshness":0.75,"weights":{"adoption":0.3,"quality":0.25,"ecosystem":0.1,"match_graph":0.3,"freshness":0.05}},"observed_outcomes":{"matches":0,"success_rate":0,"avg_confidence":0,"top_intents":[],"last_matched_at":null},"maintenance":{"status":"active","updated_at":"2026-05-24T12:16:22.764Z","last_scraped_at":"2026-04-22T08:08:14.361Z","last_commit":null},"community":{"stars":null,"forks":null,"weekly_downloads":null,"model_downloads":null,"model_likes":null}},"distribution":{"claim_url":"https://unfragile.ai/submit?claim=mlfoundations--mint-1t-pdf-cc-2024-18","compare_url":"https://unfragile.ai/compare?artifact=mlfoundations--mint-1t-pdf-cc-2024-18"}},"signature":"jdrSu02N6hq85xGO+pcpChjeIwDnNghpqJ7gMz8XSUhNS6Ingk5++ZPElvnj46IVtekycaU0IaKqrF37lx/+Dg==","signedAt":"2026-06-20T23:06:12.546Z","signedBy":"unfragile.ai","version":1},"_links":{"self":"https://unfragile.ai/api/v1/passport/mlfoundations--mint-1t-pdf-cc-2024-18","artifact":"https://unfragile.ai/mlfoundations--mint-1t-pdf-cc-2024-18","verify":"https://unfragile.ai/api/v1/verify?slug=mlfoundations--mint-1t-pdf-cc-2024-18","publicKey":"https://unfragile.ai/api/v1/trust-passport-public-key","spec":"https://unfragile.ai/trust","schema":"https://unfragile.ai/schema.json","docs":"https://unfragile.ai/docs"}}