{"passport":{"unfragile":{"@version":"1.0","version":"2026-05","artifact":{"id":"hf-dataset-llm360--txt360","slug":"llm360--txt360","name":"TxT360","type":"dataset","url":"https://huggingface.co/datasets/LLM360/TxT360","page_url":"https://unfragile.ai/llm360--txt360","categories":["model-training"],"tags":["task_categories:text-generation","language:en","license:odc-by","size_categories:n>1T","region:us"],"pricing":{"model":"open_source","free":true,"starting_price":null},"status":"active","verified":false},"capabilities":[{"id":"hf-dataset-llm360--txt360__cap_0","uri":"capability://data.processing.analysis.large.scale.pretraining.corpus.provision.for.language.models","name":"large-scale pretraining corpus provision for language models","description":"TxT360 provides a curated dataset of 360 billion tokens of English text sourced from diverse web, academic, and book sources, designed as a foundation for training or fine-tuning large language models. The dataset is structured for efficient streaming and batch processing via HuggingFace's datasets library, supporting distributed training pipelines that can load data in parallel across multiple GPUs/TPUs without requiring full dataset materialization in memory.","intents":["Train a custom LLM from scratch with a diverse, high-quality English corpus","Fine-tune an existing model on a larger, more representative dataset than proprietary alternatives","Benchmark model performance across different training data compositions","Build reproducible language model training pipelines with open-source data provenance"],"best_for":["Research teams training foundation models with open-source constraints","Organizations seeking data transparency and licensing clarity (ODC-BY license)","ML engineers building distributed training infrastructure for 7B-70B parameter models","Academic researchers studying language model scaling laws and data efficiency"],"limitations":["360B tokens is smaller than proprietary datasets (GPT-3 used ~300B, but with higher quality curation); may require supplementary domain-specific data for specialized tasks","English-only; no multilingual coverage limits applicability for non-English language models","No built-in data filtering for toxic, biased, or low-quality content — requires downstream curation","Streaming from HuggingFace Hub introduces network latency; local mirroring recommended for production training","No dynamic data augmentation or on-the-fly preprocessing; static snapshots only"],"requires":["HuggingFace datasets library (>=2.0.0)","Python 3.8+","Minimum 500GB disk space for local caching (optional but recommended)","HuggingFace account for authenticated access (free tier sufficient)","PyTorch or TensorFlow training framework compatible with HuggingFace integration"],"input_types":["None — dataset is self-contained; consumed directly via HuggingFace API"],"output_types":["Tokenized sequences (variable length, configurable via collate functions)","Raw text strings (for custom preprocessing)","Structured records with metadata (source, domain, timestamp if available)"],"categories":["data-processing-analysis","model-training"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-dataset-llm360--txt360__cap_1","uri":"capability://data.processing.analysis.multi.source.text.corpus.aggregation.and.deduplication","name":"multi-source text corpus aggregation and deduplication","description":"TxT360 integrates text from heterogeneous sources (web crawls, book collections, academic papers) into a unified, deduplicated corpus using document-level and token-level deduplication strategies. The aggregation pipeline normalizes encoding, removes near-duplicates via MinHash or similar techniques, and balances source representation to prevent any single source from dominating the training distribution.","intents":["Understand the composition and source breakdown of training data used in a language model","Identify and remove duplicate or near-duplicate documents that waste training capacity","Balance training data across diverse domains to improve model generalization","Audit data provenance and licensing compliance across multiple sources"],"best_for":["Data engineers designing training pipelines with quality-aware corpus construction","Researchers studying the impact of data composition on model capabilities and biases","Teams requiring transparent, auditable data lineage for regulatory compliance","ML practitioners optimizing training efficiency by eliminating redundant data"],"limitations":["Deduplication strategy not fully documented; unclear whether document-level or token-level dedup was prioritized","No public breakdown of source weights or filtering criteria applied per source","Deduplication may remove legitimate repetition (e.g., common phrasings in technical documentation) that aids model learning","No versioning or update mechanism; static snapshot limits ability to incorporate new high-quality sources"],"requires":["HuggingFace datasets library (>=2.0.0)","Python 3.8+","Understanding of corpus composition (metadata may be limited in public release)"],"input_types":["None — aggregation is pre-computed; dataset consumed as-is"],"output_types":["Deduplicated text documents with source attribution (if metadata included)","Token sequences ready for tokenizer input"],"categories":["data-processing-analysis","model-training"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-dataset-llm360--txt360__cap_2","uri":"capability://data.processing.analysis.streaming.dataset.access.with.distributed.training.integration","name":"streaming dataset access with distributed training integration","description":"TxT360 is exposed via HuggingFace's streaming API, enabling on-demand loading of data batches without full dataset download, with native integration for distributed training frameworks (PyTorch DistributedDataLoader, TensorFlow tf.data). The streaming architecture supports sharding across multiple workers/GPUs, automatic resumption from checkpoints, and memory-efficient iteration over the 360B token corpus.","intents":["Train models on large datasets without requiring terabytes of local storage","Distribute data loading across multiple GPUs/TPUs with minimal synchronization overhead","Resume training from checkpoints without re-downloading or re-processing data","Scale training to hundreds of GPUs with efficient data pipeline throughput"],"best_for":["Teams with distributed training infrastructure (multi-GPU, multi-node setups)","Cloud-based training environments (AWS, GCP, Azure) with limited persistent storage","Research groups training large models (7B+ parameters) where data I/O is a bottleneck","Organizations optimizing training cost by minimizing storage and network overhead"],"limitations":["Network latency from HuggingFace Hub can become a bottleneck for very large batch sizes or high-throughput training; local caching mitigates but adds complexity","Streaming assumes stable network connectivity; interruptions require resumption logic","No built-in support for dynamic batching or adaptive sampling based on loss; fixed iteration order only","Checkpoint resumption requires careful epoch/step tracking to avoid data skew across workers"],"requires":["HuggingFace datasets library (>=2.0.0) with streaming support","PyTorch (>=1.9.0) or TensorFlow (>=2.8.0) for distributed training","Python 3.8+","Stable internet connection (or local mirror for production)","Distributed training framework (torch.distributed, Hugging Face Accelerate, or equivalent)"],"input_types":["None — dataset is pre-loaded via HuggingFace API"],"output_types":["Batched token sequences (configurable batch size and sequence length)","Attention masks and position IDs (if preprocessing applied)","Metadata (source, document ID) if available in dataset schema"],"categories":["data-processing-analysis","automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-dataset-llm360--txt360__cap_3","uri":"capability://data.processing.analysis.reproducible.model.training.with.open.data.provenance","name":"reproducible model training with open data provenance","description":"TxT360 is part of the LLM360 initiative, which publishes not only the dataset but also training code, model checkpoints, and detailed documentation of the training process. This enables researchers to reproduce training runs, audit data usage, and understand exactly how models were built, supporting full transparency in foundation model development without proprietary black boxes.","intents":["Reproduce published model training results to verify claims and understand model behavior","Audit the exact data and hyperparameters used in a model to assess bias and quality","Build derivative models with modified data or training procedures while maintaining transparency","Publish research with verifiable, reproducible training pipelines"],"best_for":["Academic researchers requiring reproducibility for peer review and publication","Organizations with regulatory compliance needs (transparency, auditability)","Teams building on top of published models and needing to understand training details","Practitioners concerned about data bias and wanting to audit source composition"],"limitations":["Reproducibility depends on exact hardware, library versions, and random seeds; minor variations can produce different results at scale","Full training runs require significant compute resources (100s of GPUs); most teams cannot reproduce from scratch","Documentation quality varies; some training details may be missing or unclear","No guarantee that published checkpoints match the exact training procedure (potential drift over time)"],"requires":["Access to LLM360 GitHub repository or documentation for training code","HuggingFace datasets library (>=2.0.0)","PyTorch or TensorFlow (matching versions used in original training)","Python 3.8+","Significant compute resources for full reproduction (optional; can use published checkpoints instead)"],"input_types":["TxT360 dataset (via HuggingFace API)","Training hyperparameters and configuration files (published by LLM360)"],"output_types":["Trained model checkpoints (intermediate and final)","Training logs and metrics (loss curves, validation performance)","Reproducible model weights matching published baselines"],"categories":["data-processing-analysis","automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-dataset-llm360--txt360__cap_4","uri":"capability://data.processing.analysis.domain.balanced.text.sampling.for.model.evaluation","name":"domain-balanced text sampling for model evaluation","description":"TxT360's multi-source composition (web, books, academic) enables evaluation of model performance across diverse domains without requiring separate evaluation datasets. The corpus can be sampled to create domain-specific evaluation sets (e.g., 10% web, 30% books, 60% academic) that reflect real-world text distribution, supporting more realistic model capability assessment than single-domain benchmarks.","intents":["Evaluate model performance across diverse text domains to identify capability gaps","Create domain-balanced evaluation sets that reflect real-world text distribution","Compare model performance on web text vs. academic text vs. books to understand specialization","Assess generalization by testing on held-out data from the same sources used in training"],"best_for":["Researchers studying model generalization and domain transfer","Teams building domain-specific models and needing balanced evaluation","Practitioners assessing model robustness across diverse text types","Organizations evaluating models for production deployment across multiple domains"],"limitations":["No pre-built evaluation splits; requires custom sampling logic to create balanced evaluation sets","Domain labels may be coarse or missing; fine-grained domain classification not available","Evaluation on training data sources introduces potential data leakage if not carefully managed","No standard evaluation metrics or benchmarks published; teams must define their own evaluation protocols"],"requires":["HuggingFace datasets library (>=2.0.0)","Python 3.8+","Custom evaluation code to sample and balance domains","Understanding of model evaluation methodology (perplexity, downstream task performance, etc.)"],"input_types":["TxT360 dataset (via HuggingFace API)","Domain labels or source attribution (if available in metadata)"],"output_types":["Domain-balanced evaluation sets (text sequences or tokenized batches)","Evaluation metrics (perplexity, loss, downstream task performance per domain)"],"categories":["data-processing-analysis","planning-reasoning"],"confidence":0.5,"matches":0,"success_rate":0}],"trust":{"score":22,"verified":false,"data_access_risk":"high","permissions":["HuggingFace datasets library (>=2.0.0)","Python 3.8+","Minimum 500GB disk space for local caching (optional but recommended)","HuggingFace account for authenticated access (free tier sufficient)","PyTorch or TensorFlow training framework compatible with HuggingFace integration","Understanding of corpus composition (metadata may be limited in public release)","HuggingFace datasets library (>=2.0.0) with streaming support","PyTorch (>=1.9.0) or TensorFlow (>=2.8.0) for distributed training","Stable internet connection (or local mirror for production)","Distributed training framework (torch.distributed, Hugging Face Accelerate, or equivalent)"],"failure_modes":["360B tokens is smaller than proprietary datasets (GPT-3 used ~300B, but with higher quality curation); may require supplementary domain-specific data for specialized tasks","English-only; no multilingual coverage limits applicability for non-English language models","No built-in data filtering for toxic, biased, or low-quality content — requires downstream curation","Streaming from HuggingFace Hub introduces network latency; local mirroring recommended for production training","No dynamic data augmentation or on-the-fly preprocessing; static snapshots only","Deduplication strategy not fully documented; unclear whether document-level or token-level dedup was prioritized","No public breakdown of source weights or filtering criteria applied per source","Deduplication may remove legitimate repetition (e.g., common phrasings in technical documentation) that aids model learning","No versioning or update mechanism; static snapshot limits ability to incorporate new high-quality sources","Network latency from HuggingFace Hub can become a bottleneck for very large batch sizes or high-throughput training; local caching mitigates but adds complexity","builder identity is not verified yet","no observed match outcomes yet"],"rank_breakdown":{"adoption":0.05,"quality":0.2,"ecosystem":0.45,"match_graph":0.25,"freshness":0.75,"weights":{"adoption":0.3,"quality":0.25,"ecosystem":0.1,"match_graph":0.3,"freshness":0.05}},"observed_outcomes":{"matches":0,"success_rate":0,"avg_confidence":0,"top_intents":[],"last_matched_at":null},"maintenance":{"status":"active","updated_at":"2026-05-24T12:16:22.764Z","last_scraped_at":"2026-05-03T14:22:48.064Z","last_commit":null},"community":{"stars":null,"forks":null,"weekly_downloads":null,"model_downloads":null,"model_likes":null}},"distribution":{"claim_url":"https://unfragile.ai/submit?claim=llm360--txt360","compare_url":"https://unfragile.ai/compare?artifact=llm360--txt360"}},"signature":"K+lddRun1VYky2rNyJwVB3pVLXP/FqGDMmKF4zvcMTc9gf+8hBziSSciT6uwgXj5Q+125zy1D3vIwQGqPv3YAA==","signedAt":"2026-06-20T10:44:08.115Z","signedBy":"unfragile.ai","version":1},"_links":{"self":"https://unfragile.ai/api/v1/passport/llm360--txt360","artifact":"https://unfragile.ai/llm360--txt360","verify":"https://unfragile.ai/api/v1/verify?slug=llm360--txt360","publicKey":"https://unfragile.ai/api/v1/trust-passport-public-key","spec":"https://unfragile.ai/trust","schema":"https://unfragile.ai/schema.json","docs":"https://unfragile.ai/docs"}}