{"passport":{"unfragile":{"@version":"1.0","version":"2026-05","artifact":{"id":"hf-dataset-mlfoundations--mint-1t-pdf-cc-2023-50","slug":"mlfoundations--mint-1t-pdf-cc-2023-50","name":"MINT-1T-PDF-CC-2023-50","type":"dataset","url":"https://huggingface.co/datasets/mlfoundations/MINT-1T-PDF-CC-2023-50","page_url":"https://unfragile.ai/mlfoundations--mint-1t-pdf-cc-2023-50","categories":["model-training"],"tags":["task_categories:image-to-text","task_categories:text-generation","language:en","license:cc-by-4.0","size_categories:1M<n<10M","format:webdataset","modality:image","modality:text","library:datasets","library:webdataset","library:mlcroissant","arxiv:2406.11271","region:us","multimodal"],"pricing":{"model":"open_source","free":true,"starting_price":null},"status":"active","verified":false},"capabilities":[{"id":"hf-dataset-mlfoundations--mint-1t-pdf-cc-2023-50__cap_0","uri":"capability://data.processing.analysis.multimodal.pdf.to.text.extraction.at.scale","name":"multimodal pdf-to-text extraction at scale","description":"Extracts text and image content from 796K+ PDF documents sourced from Common Crawl 2023, using a structured pipeline that preserves document layout and image-text relationships. The dataset uses WebDataset format for efficient streaming access to tar-archived samples, enabling distributed training without requiring full dataset materialization. Implementation leverages MLCroissant metadata standards to expose dataset schema and provenance, making it compatible with automated data discovery and validation workflows.","intents":["Train vision-language models on real-world PDF documents with preserved spatial relationships","Build document understanding systems that handle mixed text and image content","Create datasets for OCR and document layout analysis tasks","Evaluate multimodal models on naturally-occurring document structures"],"best_for":["ML researchers training vision-language models (CLIP, LLaVA, etc.)","Teams building document intelligence systems for enterprise use","Researchers studying multimodal learning on real-world data distributions"],"limitations":["English-only content — no multilingual document support","Fixed to 2023 Common Crawl snapshot — no real-time updates or historical versions","WebDataset format requires compatible loaders; not directly compatible with standard PyTorch DataLoader without adapter code","Image quality varies by source PDF; no quality filtering or deduplication applied","No built-in train/val/test splits — requires manual partitioning for reproducible experiments"],"requires":["Python 3.8+","HuggingFace datasets library (>=2.0)","webdataset library for tar-based streaming","~500GB+ disk space for full dataset or streaming access via HuggingFace Hub","mlcroissant library for metadata inspection (optional)"],"input_types":["PDF documents (raw binary from Common Crawl)","Document URLs and metadata"],"output_types":["Extracted text (UTF-8 strings with layout preservation)","Image tensors (PIL Image or numpy arrays)","Document metadata (source URL, extraction timestamp, page count)"],"categories":["data-processing-analysis","multimodal-dataset"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-dataset-mlfoundations--mint-1t-pdf-cc-2023-50__cap_1","uri":"capability://data.processing.analysis.streaming.dataset.access.via.webdataset.protocol","name":"streaming dataset access via webdataset protocol","description":"Implements efficient streaming access to 796K+ samples through WebDataset tar-archive format, allowing models to load batches directly from cloud storage without full dataset materialization. The architecture uses tar-based sharding with configurable batch sizes, enabling distributed training across multiple GPUs/TPUs by streaming different tar shards to different workers. Integration with HuggingFace Hub provides automatic caching, resumable downloads, and version management.","intents":["Train models on large datasets without local storage constraints","Distribute dataset loading across multiple training nodes with minimal coordination","Resume interrupted training without re-downloading already-processed samples","Reduce training startup time by streaming data on-demand rather than pre-staging"],"best_for":["Teams with distributed training infrastructure (multi-GPU, multi-node setups)","Researchers with limited local storage but good network bandwidth","Production ML pipelines requiring fault-tolerant data loading"],"limitations":["Sequential access pattern within tar archives — random access requires full archive decompression","Network latency adds ~50-200ms per tar shard fetch depending on cloud region","Requires compatible training framework integration (PyTorch Lightning, Hugging Face Transformers); raw PyTorch DataLoader needs adapter code","Streaming from HuggingFace Hub throttles at ~100MB/s per connection; parallel downloads require multiple worker processes","No built-in deduplication — duplicate documents across tar shards not removed"],"requires":["webdataset Python library (>=0.2.0)","HuggingFace datasets library (>=2.0)","Network bandwidth >=10 Mbps for practical training throughput","PyTorch or TensorFlow with distributed training support"],"input_types":["Tar-archived sample collections","Dataset configuration (shard indices, batch size, worker count)"],"output_types":["Batched tensors (images, text)","Sample metadata dictionaries","Streaming iterators compatible with training loops"],"categories":["data-processing-analysis","automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-dataset-mlfoundations--mint-1t-pdf-cc-2023-50__cap_2","uri":"capability://data.processing.analysis.mlcroissant.metadata.schema.exposure","name":"mlcroissant metadata schema exposure","description":"Exposes dataset structure, provenance, and licensing through MLCroissant metadata standard, enabling automated discovery, validation, and integration with data governance tools. The metadata includes field schemas (text vs. image), record counts, source attribution (Common Crawl 2023), and CC-BY-4.0 licensing terms. This enables downstream tools to automatically validate data compatibility, generate data cards, and enforce licensing compliance without manual inspection.","intents":["Automatically discover and validate dataset compatibility with model training pipelines","Generate reproducible data cards for model documentation and transparency","Enforce licensing compliance in automated ML workflows","Enable data lineage tracking from raw PDFs through extracted samples"],"best_for":["ML teams with governance requirements (compliance, licensing tracking)","Researchers publishing models and needing transparent data attribution","Automated ML platforms building dataset discovery and validation layers"],"limitations":["MLCroissant standard is still evolving — schema may change in future versions","Metadata does not include per-sample quality scores or filtering recommendations","No built-in validation of actual extracted content against declared schema","Licensing metadata (CC-BY-4.0) is declarative only — no enforcement mechanism in the dataset itself"],"requires":["mlcroissant library (>=0.3.0) for metadata parsing","JSON schema validation tools (optional, for custom compliance checks)"],"input_types":["MLCroissant JSON metadata file"],"output_types":["Parsed schema (field names, types, cardinality)","Provenance metadata (source, timestamp, version)","Licensing and attribution information"],"categories":["data-processing-analysis","memory-knowledge"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-dataset-mlfoundations--mint-1t-pdf-cc-2023-50__cap_3","uri":"capability://data.processing.analysis.common.crawl.pdf.document.sourcing.and.deduplication","name":"common crawl pdf document sourcing and deduplication","description":"Sources 796K+ PDF documents from Common Crawl 2023 snapshot using URL-based deduplication and content filtering to ensure dataset diversity. The pipeline crawls Common Crawl's WARC archives, extracts PDF URLs, filters by document type and size, and deduplicates based on URL canonicalization and optional content hashing. This ensures the dataset represents a broad cross-section of real-world PDFs rather than duplicates or spam.","intents":["Build training datasets from real-world document distributions without manual curation","Ensure dataset diversity by filtering spam and near-duplicate PDFs","Create reproducible datasets tied to specific Common Crawl snapshots for research transparency","Scale document collection to millions of samples without manual annotation"],"best_for":["Researchers building large-scale document understanding models","Teams needing representative samples of real-world PDF distributions","Organizations requiring transparent, reproducible data sourcing"],"limitations":["Fixed to 2023 Common Crawl snapshot — no real-time updates or ability to add newer documents","URL-based deduplication may miss semantically duplicate content with different URLs","No filtering for document quality, readability, or relevance — includes low-quality scans and corrupted PDFs","Extraction quality depends on PDF structure; scanned images without OCR produce empty text","No filtering for sensitive content (PII, medical records, etc.) — requires downstream filtering"],"requires":["Access to Common Crawl WARC archives (publicly available via AWS S3)","PDF parsing library (PyPDF2, pdfplumber, or similar)","URL canonicalization and deduplication logic"],"input_types":["Common Crawl 2023 WARC index","PDF URLs and metadata"],"output_types":["Deduplicated PDF document collection","Extracted text and images per document","Source URL and crawl metadata"],"categories":["data-processing-analysis","search-retrieval"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-dataset-mlfoundations--mint-1t-pdf-cc-2023-50__cap_4","uri":"capability://data.processing.analysis.image.text.spatial.relationship.preservation.in.document.extraction","name":"image-text spatial relationship preservation in document extraction","description":"Preserves spatial layout and image-text relationships during PDF extraction, maintaining document structure rather than flattening to generic image-caption pairs. The extraction pipeline preserves page coordinates, image bounding boxes, and text positioning, enabling downstream models to learn document layout patterns. This is critical for tasks like table extraction, form understanding, and document classification where spatial relationships carry semantic meaning.","intents":["Train models that understand document layout and structure, not just isolated images and text","Build systems for table extraction and form understanding that rely on spatial relationships","Create datasets for document classification tasks where layout is semantically meaningful","Evaluate vision-language models on their ability to reason about document structure"],"best_for":["Researchers building document intelligence systems (table extraction, form parsing)","Teams training layout-aware vision-language models","Organizations building document classification systems"],"limitations":["Spatial metadata increases dataset size by ~20-30% compared to flattened image-text pairs","Coordinate systems vary by PDF library — no standardization across different extraction tools","Complex layouts (multi-column, rotated text) may not preserve correctly depending on PDF structure","No built-in visualization tools for validating spatial relationships — requires custom inspection code","Training frameworks need custom collate functions to handle variable-length spatial metadata"],"requires":["PDF parsing library with layout analysis (pdfplumber, PyPDF2 with layout plugins)","Custom data loaders to handle spatial metadata alongside images and text"],"input_types":["PDF documents with embedded images and text"],"output_types":["Images with bounding box coordinates","Text with page coordinates and positioning","Document structure metadata (page layout, text flow order)"],"categories":["data-processing-analysis","image-visual"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-dataset-mlfoundations--mint-1t-pdf-cc-2023-50__cap_5","uri":"capability://data.processing.analysis.cc.by.4.0.licensed.dataset.with.transparent.attribution","name":"cc-by-4.0 licensed dataset with transparent attribution","description":"Provides dataset under CC-BY-4.0 open license with transparent source attribution to Common Crawl and original document creators. The licensing model enables commercial and research use with attribution requirements, and the dataset includes source URL metadata enabling downstream users to provide proper attribution. This transparency supports reproducible research and compliance with open licensing standards.","intents":["Use dataset for commercial model training while maintaining legal compliance","Build reproducible research with transparent data provenance and attribution","Contribute to open-source AI development with clear licensing terms","Ensure downstream users can provide proper attribution to original creators"],"best_for":["Researchers and companies requiring open-source training data with commercial use rights","Organizations with strict licensing compliance requirements","Open-source AI projects needing legally clear training data"],"limitations":["CC-BY-4.0 requires attribution in derived works — no enforcement mechanism in dataset itself","Original PDF creators may not have intended their documents for ML training — ethical concerns despite legal compliance","Some PDFs may contain third-party copyrighted content (images, text) not covered by CC-BY-4.0","No mechanism to opt-out or request removal of specific documents from the dataset"],"requires":["Understanding of CC-BY-4.0 license terms and attribution requirements","Mechanism to track and provide attribution to source URLs in downstream models"],"input_types":["CC-BY-4.0 license declaration","Source URL metadata for attribution"],"output_types":["Licensed dataset with clear attribution requirements","Metadata enabling downstream attribution"],"categories":["data-processing-analysis","safety-moderation"],"confidence":0.5,"matches":0,"success_rate":0}],"trust":{"score":23,"verified":false,"data_access_risk":"low","permissions":["Python 3.8+","HuggingFace datasets library (>=2.0)","webdataset library for tar-based streaming","~500GB+ disk space for full dataset or streaming access via HuggingFace Hub","mlcroissant library for metadata inspection (optional)","webdataset Python library (>=0.2.0)","Network bandwidth >=10 Mbps for practical training throughput","PyTorch or TensorFlow with distributed training support","mlcroissant library (>=0.3.0) for metadata parsing","JSON schema validation tools (optional, for custom compliance checks)"],"failure_modes":["English-only content — no multilingual document support","Fixed to 2023 Common Crawl snapshot — no real-time updates or historical versions","WebDataset format requires compatible loaders; not directly compatible with standard PyTorch DataLoader without adapter code","Image quality varies by source PDF; no quality filtering or deduplication applied","No built-in train/val/test splits — requires manual partitioning for reproducible experiments","Sequential access pattern within tar archives — random access requires full archive decompression","Network latency adds ~50-200ms per tar shard fetch depending on cloud region","Requires compatible training framework integration (PyTorch Lightning, Hugging Face Transformers); raw PyTorch DataLoader needs adapter code","Streaming from HuggingFace Hub throttles at ~100MB/s per connection; parallel downloads require multiple worker processes","No built-in deduplication — duplicate documents across tar shards not removed","builder identity is not verified yet","no observed match outcomes yet"],"rank_breakdown":{"adoption":0.05,"quality":0.22,"ecosystem":0.5000000000000001,"match_graph":0.25,"freshness":0.75,"weights":{"adoption":0.3,"quality":0.25,"ecosystem":0.1,"match_graph":0.3,"freshness":0.05}},"observed_outcomes":{"matches":0,"success_rate":0,"avg_confidence":0,"top_intents":[],"last_matched_at":null},"maintenance":{"status":"active","updated_at":"2026-05-24T12:16:22.764Z","last_scraped_at":"2026-04-22T08:08:14.361Z","last_commit":null},"community":{"stars":null,"forks":null,"weekly_downloads":null,"model_downloads":null,"model_likes":null}},"distribution":{"claim_url":"https://unfragile.ai/submit?claim=mlfoundations--mint-1t-pdf-cc-2023-50","compare_url":"https://unfragile.ai/compare?artifact=mlfoundations--mint-1t-pdf-cc-2023-50"}},"signature":"V6w3XFNyCeVjbFM0m0/Z7/1Jq/MsiOnymMlRCH751hFM3K6Ix9nDTPzGFH4/7Ao7x6JcKknrsT+jmlL+q15MBw==","signedAt":"2026-06-22T13:59:18.666Z","signedBy":"unfragile.ai","version":1},"_links":{"self":"https://unfragile.ai/api/v1/passport/mlfoundations--mint-1t-pdf-cc-2023-50","artifact":"https://unfragile.ai/mlfoundations--mint-1t-pdf-cc-2023-50","verify":"https://unfragile.ai/api/v1/verify?slug=mlfoundations--mint-1t-pdf-cc-2023-50","publicKey":"https://unfragile.ai/api/v1/trust-passport-public-key","spec":"https://unfragile.ai/trust","schema":"https://unfragile.ai/schema.json","docs":"https://unfragile.ai/docs"}}