{"passport":{"unfragile":{"@version":"1.0","version":"2026-05","artifact":{"id":"redpajama-v2","slug":"redpajama-v2","name":"RedPajama v2","type":"dataset","url":"https://together.ai/blog/redpajama-data-v2","page_url":"https://unfragile.ai/redpajama-v2","categories":["model-training","documentation","testing-quality"],"tags":[],"pricing":{"model":"free","free":true,"starting_price":null},"status":"active","verified":false},"capabilities":[{"id":"redpajama-v2__cap_0","uri":"capability://data.processing.analysis.multi.language.web.scale.document.collection.with.40.quality.annotations","name":"multi-language web-scale document collection with 40+ quality annotations","description":"Aggregates 100+ billion deduplicated documents (30 trillion tokens) from 84 CommonCrawl dumps across 5 languages (English, German, French, Spanish, Italian). Each document is pre-annotated with 40+ quality signals including perplexity scores, deduplication hashes, content classifiers, and toxicity ratings computed via a standardized pipeline. The architecture processes raw CommonCrawl HTML through text extraction, deduplication, and multi-dimensional quality scoring, enabling downstream users to apply custom filtering strategies without reprocessing the raw data.","intents":["I need a massive, reproducible web corpus to train foundation models without building my own data pipeline","I want to study how different quality signals correlate with downstream model performance","I need to compare data curation strategies across multiple filtering thresholds on the same base dataset","I want multilingual training data with consistent annotation methodology across languages"],"best_for":["LLM researchers training foundation models at scale","organizations studying data curation and filtering strategies","teams building open-source language models across multiple languages","data scientists analyzing web content quality distributions"],"limitations":["Web-only source (CommonCrawl) inherits web biases, spam, and low-quality content; requires downstream filtering to achieve production quality","40+ quality signals are pre-computed but specific signal definitions and validation methodology not publicly documented","No domain-specific data (code, scientific papers, books) — coverage limited to web content","5 languages only; no coverage for non-Latin scripts or low-resource languages","Raw data (100+ trillion tokens) is 3.3× larger than processed data, indicating significant filtering already applied; original filtering thresholds not transparent","No temporal metadata or freshness guarantees for CommonCrawl dumps"],"requires":["Hugging Face account for dataset access","Storage capacity for 30 trillion tokens (exact storage requirements not publicly specified, but likely terabytes)","Data loading library (HuggingFace datasets library or equivalent)","Python 3.7+ for processing and filtering scripts","Understanding of quality signal interpretation and data curation methodology"],"input_types":["CommonCrawl HTML dumps (84 dumps, 100+ trillion raw tokens)"],"output_types":["deduplicated text documents with structured metadata","quality signal annotations (perplexity, toxicity, content classifiers, deduplication hashes)","filtered subsets based on custom quality thresholds"],"categories":["data-processing-analysis","model-training"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"redpajama-v2__cap_1","uri":"capability://data.processing.analysis.document.level.deduplication.with.hash.based.matching","name":"document-level deduplication with hash-based matching","description":"Implements deduplication across 100+ billion documents using hash-based matching to identify and remove duplicate content from CommonCrawl. The pipeline computes deduplication hashes for each document and filters the raw 100+ trillion token corpus down to 30 trillion deduplicated tokens. This approach preserves document boundaries (unlike token-level deduplication) and produces deterministic, reproducible results across reprocessing runs.","intents":["I need to remove duplicate web content before training to avoid data leakage and redundancy","I want reproducible deduplication that I can verify or reapply to new data","I need to understand which documents are duplicates across CommonCrawl dumps"],"best_for":["LLM training teams concerned with data quality and training efficiency","researchers studying the impact of deduplication on model performance","organizations building custom datasets from CommonCrawl"],"limitations":["Deduplication algorithm details (exact vs. fuzzy matching, hash function, collision handling) not publicly documented","No information on deduplication accuracy, false positive/negative rates, or sensitivity to minor text variations","Document-level deduplication may miss near-duplicate content or paraphrased versions","Deduplication hashes are pre-computed; users cannot apply custom deduplication strategies without reprocessing"],"requires":["Access to RedPajama v2 dataset on Hugging Face","Understanding of hash-based deduplication methodology","Ability to parse and filter documents by deduplication hash"],"input_types":["raw CommonCrawl documents (100+ billion documents, 100+ trillion tokens)"],"output_types":["deduplicated document set (100+ billion documents, 30 trillion tokens)","deduplication hashes per document"],"categories":["data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"redpajama-v2__cap_10","uri":"capability://data.processing.analysis.free.and.open.source.corpus.access","name":"free and open-source corpus access","description":"Provides the entire 30 trillion token corpus, processing scripts, and quality annotations as free, open-source resources with no licensing restrictions. Users can download, modify, redistribute, and use the data for any purpose including commercial applications. This open approach enables broad research access and community-driven improvements without vendor lock-in.","intents":["I want to use a large pretraining corpus without paying licensing fees","I need to build a commercial model without data licensing restrictions","I want to modify and redistribute the corpus for my community","I need to ensure reproducibility by using openly available data"],"best_for":["academic researchers with limited budgets","startups and small teams building commercial models","organizations in countries with restricted data access","open-source projects and community initiatives"],"limitations":["Free distribution means no commercial support or SLA guarantees","No liability or warranty — users assume all risk for data quality and legal compliance","Open-source license may have restrictions on derivative works (depends on specific license)","No guaranteed availability or uptime for HuggingFace distribution","Community support only — no dedicated support team for issues or questions","Users responsible for compliance with local data regulations (GDPR, CCPA, etc.)"],"requires":["HuggingFace account (free)","Understanding of open-source licensing terms","Compliance with applicable data regulations in your jurisdiction"],"input_types":["none — data is freely available"],"output_types":["30 trillion token corpus","processing scripts","quality annotations"],"categories":["data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"redpajama-v2__cap_2","uri":"capability://data.processing.analysis.perplexity.based.quality.scoring.for.language.model.fitness","name":"perplexity-based quality scoring for language model fitness","description":"Computes perplexity scores for each document using a reference language model, enabling quantitative assessment of text quality and language model fitness. The perplexity metric measures how well a pre-trained model predicts the document; lower perplexity indicates higher-quality, more coherent text. These pre-computed scores allow users to filter documents by quality threshold without running inference themselves, and to study the relationship between perplexity and downstream model performance.","intents":["I want to filter documents by language model fitness without computing perplexity myself","I need to understand the distribution of text quality across the web corpus","I want to study how perplexity thresholds affect downstream model performance"],"best_for":["LLM researchers optimizing data quality for training","teams studying the relationship between document quality metrics and model performance","practitioners filtering web data for production training runs"],"limitations":["Perplexity scores are computed using an unspecified reference model; different reference models produce different scores","No information on which reference model was used, its training data, or its language coverage","Perplexity is a proxy for quality but does not capture semantic relevance, factuality, or domain-specific fitness","Pre-computed scores cannot be updated if reference model changes; users cannot apply custom perplexity models"],"requires":["Access to RedPajama v2 dataset with pre-computed perplexity scores","Understanding of perplexity as a quality metric and its limitations","Ability to filter documents by perplexity threshold"],"input_types":["text documents from RedPajama v2"],"output_types":["perplexity scores (numeric, per document)","filtered document subsets based on perplexity thresholds"],"categories":["data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"redpajama-v2__cap_3","uri":"capability://data.processing.analysis.content.classification.and.toxicity.annotation.across.documents","name":"content classification and toxicity annotation across documents","description":"Annotates each document with content classifiers and toxicity ratings, enabling category-based filtering and safety-aware data curation. The pipeline applies pre-trained classifiers to categorize document content (e.g., news, forums, documentation) and compute toxicity scores. These annotations are pre-computed and stored with each document, allowing users to filter by content type or toxicity threshold without running inference themselves.","intents":["I want to filter out toxic or harmful content before training","I need to understand the distribution of content types in the web corpus","I want to train models on specific content categories (e.g., news, documentation) without manual labeling"],"best_for":["teams building production LLMs with safety requirements","researchers studying the impact of content filtering on model behavior","organizations curating domain-specific training datasets from web data"],"limitations":["Specific content classifiers and toxicity detection models not documented; unclear what categories are supported","No information on classifier accuracy, false positive/negative rates, or validation methodology","Toxicity is subjective and culturally dependent; pre-computed scores may not align with user values","Content classifiers may have biases or poor performance on non-English content","Pre-computed annotations cannot be updated or customized; users cannot apply alternative classifiers"],"requires":["Access to RedPajama v2 dataset with pre-computed content classifiers and toxicity scores","Understanding of content categories and toxicity metrics","Ability to filter documents by content type or toxicity threshold"],"input_types":["text documents from RedPajama v2"],"output_types":["content classification labels (category per document)","toxicity scores (numeric, per document)","filtered document subsets based on content type or toxicity threshold"],"categories":["data-processing-analysis","safety-moderation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"redpajama-v2__cap_4","uri":"capability://data.processing.analysis.open.source.reproducible.data.processing.pipeline","name":"open-source reproducible data processing pipeline","description":"Publishes end-to-end processing scripts on GitHub that convert raw CommonCrawl HTML to deduplicated, annotated documents. The pipeline is fully open-source, enabling users to understand, verify, and reproduce the data processing methodology. Scripts handle HTML-to-text conversion, deduplication, quality signal computation, and filtering, allowing researchers to reprocess data with custom parameters or apply the same methodology to new CommonCrawl dumps.","intents":["I want to understand exactly how the data was processed and verify the methodology","I need to reprocess CommonCrawl with custom parameters or apply the same pipeline to new dumps","I want to audit the data processing for biases or errors"],"best_for":["LLM researchers prioritizing reproducibility and transparency","teams building custom datasets using the same methodology","organizations auditing data processing for compliance or bias"],"limitations":["GitHub repository URL not provided in documentation; requires searching for 'RedPajama' on GitHub","No version pinning or reproducibility guarantees mentioned; scripts may change over time","Computational requirements for reprocessing 84 CommonCrawl dumps not documented; likely prohibitively expensive for most users","HTML-to-text conversion artifacts acknowledged but not detailed; conversion quality may vary across documents","Scripts may require specific Python versions, dependencies, or compute infrastructure not documented"],"requires":["GitHub access to RedPajama v2 repository","Python 3.7+ with required dependencies (not specified)","Significant compute resources to reprocess CommonCrawl dumps (cost/time not documented)","Understanding of CommonCrawl data format and structure"],"input_types":["CommonCrawl WARC files (raw HTML dumps)"],"output_types":["deduplicated text documents","quality signal annotations","processing logs and statistics"],"categories":["data-processing-analysis","automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"redpajama-v2__cap_5","uri":"capability://data.processing.analysis.fine.grained.data.curation.via.quality.signal.filtering","name":"fine-grained data curation via quality signal filtering","description":"Enables users to apply custom filtering strategies by combining 40+ pre-computed quality signals (perplexity, toxicity, content classifiers, deduplication hashes, etc.). Rather than providing pre-filtered 'ready-to-train' datasets, RedPajama v2 provides the raw signals and lets users define their own filtering logic. This architecture supports comparative studies of curation strategies and enables organizations to apply domain-specific or value-aligned filtering without reprocessing the base dataset.","intents":["I want to experiment with different quality thresholds to find the optimal filtering strategy","I need to apply custom filtering based on my organization's values or domain requirements","I want to study how different curation strategies affect downstream model performance"],"best_for":["LLM researchers studying data curation methodology","teams optimizing data quality for specific use cases","organizations with custom filtering requirements (domain-specific, safety-aligned, etc.)"],"limitations":["Requires users to implement their own filtering logic; no pre-filtered 'ready-to-train' datasets provided","40+ quality signals are pre-computed but specific signal definitions not documented; users must infer signal semantics from usage","No guidance on which signal combinations are most effective or how signals interact","Filtering logic is user-defined; no validation or best-practice recommendations provided","No built-in tools for filtering, analysis, or visualization; users must implement custom analysis"],"requires":["Access to RedPajama v2 dataset with all 40+ quality signals","Data processing tools (Python, Pandas, or equivalent) to implement filtering logic","Understanding of quality signals and their interpretation","Compute resources to load and filter large subsets of the dataset"],"input_types":["RedPajama v2 documents with 40+ quality signal annotations"],"output_types":["filtered document subsets based on custom quality thresholds","analysis of quality signal distributions and filtering impact"],"categories":["data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"redpajama-v2__cap_6","uri":"capability://data.processing.analysis.multilingual.web.corpus.with.consistent.annotation.across.5.languages","name":"multilingual web corpus with consistent annotation across 5 languages","description":"Provides 30 trillion tokens across 5 languages (English, German, French, Spanish, Italian) with consistent quality signal annotations applied uniformly across all languages. The architecture processes each language through the same deduplication, quality scoring, and classification pipeline, enabling comparative studies of language-specific data characteristics and training multilingual models on a standardized base dataset. Language-specific processing details are not documented, but the consistent annotation methodology enables cross-language analysis.","intents":["I want to train multilingual models on a large, standardized corpus without building separate pipelines per language","I need to study how data quality varies across languages","I want to compare curation strategies across multiple languages"],"best_for":["teams training multilingual foundation models","researchers studying language-specific data quality and biases","organizations building models for European languages"],"limitations":["5 languages only (English, German, French, Spanish, Italian); no coverage for non-Latin scripts, Asian languages, or low-resource languages","Language-specific processing details not documented; unclear how language detection, filtering, or normalization is handled","No information on language distribution (token count per language, document count per language)","Quality signals computed uniformly across languages; may not capture language-specific quality issues","No domain-specific data per language (code, scientific papers, etc.)"],"requires":["Access to RedPajama v2 dataset with multilingual documents","Ability to filter documents by language","Understanding of language-specific data characteristics and biases"],"input_types":["CommonCrawl documents in 5 languages (English, German, French, Spanish, Italian)"],"output_types":["deduplicated, annotated documents per language","language-specific quality signal distributions","filtered subsets per language or language-balanced subsets"],"categories":["data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"redpajama-v2__cap_7","uri":"capability://data.processing.analysis.commoncrawl.scale.data.aggregation.from.84.dumps","name":"commoncrawl-scale data aggregation from 84 dumps","description":"Aggregates data from 84 CommonCrawl dumps (100+ trillion raw tokens) into a single, deduplicated, consistently-annotated dataset. The architecture handles the complexity of processing massive-scale web data including deduplication across dumps, consistent quality signal computation, and language-specific filtering. This enables users to work with a unified, large-scale web corpus without managing multiple CommonCrawl dumps or implementing their own aggregation pipeline.","intents":["I want to train on massive-scale web data without managing 84 separate CommonCrawl dumps","I need a unified dataset for comparative studies across multiple web snapshots","I want to understand the scale and characteristics of web data at CommonCrawl scale"],"best_for":["organizations training large foundation models requiring massive data scale","researchers studying web data characteristics and biases at scale","teams building open-source LLMs with reproducible data sources"],"limitations":["Raw data (100+ trillion tokens) is 3.3× larger than processed data (30 trillion), indicating significant filtering; original filtering thresholds and rationale not transparent","No information on which CommonCrawl dumps are included, their temporal coverage, or how dumps are selected","Web-only source inherits web biases (overrepresentation of English, technical content, etc.)","No domain-specific data (code, scientific papers, books); coverage limited to web content","Computational requirements for processing 84 dumps not documented; likely prohibitively expensive for most users to reprocess"],"requires":["Access to RedPajama v2 dataset on Hugging Face","Storage capacity for 30 trillion tokens (exact requirements not specified)","Data loading and processing tools (HuggingFace datasets library or equivalent)"],"input_types":["84 CommonCrawl WARC dumps (100+ trillion raw tokens)"],"output_types":["unified, deduplicated dataset (30 trillion tokens)","quality signal annotations per document","language-specific subsets"],"categories":["data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"redpajama-v2__cap_8","uri":"capability://data.processing.analysis.open.source.processing.pipeline.and.transparency","name":"open-source processing pipeline and transparency","description":"Publishes processing scripts on GitHub enabling users to understand, validate, and extend the data processing pipeline. Scripts cover HTML-to-text conversion, deduplication, quality signal computation, and filtering. This transparency enables reproducible research, allows users to apply custom modifications, and supports community contributions. Users can inspect the exact methodology used for corpus creation and adapt it for their own data sources.","intents":["I want to understand exactly how the corpus was processed to validate methodology","I need to apply the same processing pipeline to my own data sources","I want to modify the processing pipeline for custom quality signals or filtering","I need to contribute improvements or bug fixes to the processing code"],"best_for":["researchers validating data processing methodology","teams applying RedPajama processing to custom data sources","developers extending or modifying the processing pipeline","open-source contributors improving data quality"],"limitations":["Processing scripts require significant compute resources to execute — not practical for most users to reprocess full corpus","Documentation of processing scripts likely incomplete — users may need to read code to understand methodology","Scripts may have dependencies on specific libraries or infrastructure not available to all users","Modifications to processing pipeline require substantial engineering effort and validation","No automated testing or validation framework provided for custom modifications","Processing pipeline execution time measured in weeks — limits ability to experiment with modifications"],"requires":["Python 3.7+ and data processing libraries (Spark, Dask, or similar)","Access to GitHub repository and familiarity with Git","Understanding of data processing pipelines and distributed computing","Substantial compute infrastructure to execute processing scripts","Familiarity with HTML parsing, text processing, and deduplication algorithms"],"input_types":["processing scripts from GitHub","CommonCrawl dumps or custom data sources","configuration files for processing parameters"],"output_types":["processed and deduplicated text corpus","quality signal annotations","processing logs and statistics"],"categories":["data-processing-analysis","automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"redpajama-v2__cap_9","uri":"capability://tool.use.integration.huggingface.dataset.distribution.and.streaming","name":"huggingface dataset distribution and streaming","description":"Distributes the 30 trillion token corpus via HuggingFace Datasets, enabling users to download, stream, or access subsets without managing raw files directly. HuggingFace integration provides standardized data loading APIs compatible with PyTorch, TensorFlow, and other ML frameworks. Users can load documents with quality annotations, apply filters, and create training dataloaders with minimal code.","intents":["I want to load RedPajama data into my training pipeline with minimal code","I need to stream data from HuggingFace rather than downloading the full 30 trillion token corpus","I want to use standard PyTorch DataLoader with RedPajama data","I need to access specific language subsets or filtered versions through HuggingFace"],"best_for":["ML engineers integrating RedPajama into training pipelines","teams with limited storage but access to HuggingFace streaming","researchers using standard PyTorch/TensorFlow workflows","organizations building on HuggingFace ecosystem"],"limitations":["Streaming from HuggingFace requires stable internet connection — not suitable for offline training","Streaming bandwidth may be bottleneck for large-scale training — local storage often faster","HuggingFace API changes may break compatibility with older code","No built-in support for custom filtering or quality signal application in HuggingFace integration","Dataset size (30 trillion tokens) may exceed HuggingFace storage limits or require special handling","Streaming performance depends on HuggingFace infrastructure availability and network conditions"],"requires":["HuggingFace account and datasets library (pip install datasets)","Python 3.7+ and PyTorch or TensorFlow","Stable internet connection for streaming","Familiarity with HuggingFace Datasets API"],"input_types":["HuggingFace dataset identifiers and configuration"],"output_types":["PyTorch DataLoader or TensorFlow tf.data.Dataset","document batches with quality annotations","filtered or language-specific subsets"],"categories":["tool-use-integration","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"redpajama-v2__headline","uri":"capability://data.processing.analysis.large.scale.annotated.dataset.for.llm.training","name":"large-scale annotated dataset for llm training","description":"RedPajama v2 is a massive, open-source dataset containing 30 trillion tokens and over 100 billion documents, specifically designed for training large language models with extensive quality annotations for data curation.","intents":["best dataset for LLM training","large-scale dataset for NLP research","annotated dataset for language models","free dataset for machine learning","multi-language dataset for AI"],"best_for":["researchers","AI practitioners"],"limitations":["data quality variability","static dataset"],"requires":["computational resources"],"input_types":["text documents"],"output_types":["training data for LLMs"],"categories":["data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0}],"trust":{"score":60,"verified":false,"data_access_risk":"high","permissions":["Hugging Face account for dataset access","Storage capacity for 30 trillion tokens (exact storage requirements not publicly specified, but likely terabytes)","Data loading library (HuggingFace datasets library or equivalent)","Python 3.7+ for processing and filtering scripts","Understanding of quality signal interpretation and data curation methodology","Access to RedPajama v2 dataset on Hugging Face","Understanding of hash-based deduplication methodology","Ability to parse and filter documents by deduplication hash","HuggingFace account (free)","Understanding of open-source licensing terms"],"failure_modes":["Web-only source (CommonCrawl) inherits web biases, spam, and low-quality content; requires downstream filtering to achieve production quality","40+ quality signals are pre-computed but specific signal definitions and validation methodology not publicly documented","No domain-specific data (code, scientific papers, books) — coverage limited to web content","5 languages only; no coverage for non-Latin scripts or low-resource languages","Raw data (100+ trillion tokens) is 3.3× larger than processed data, indicating significant filtering already applied; original filtering thresholds not transparent","No temporal metadata or freshness guarantees for CommonCrawl dumps","Deduplication algorithm details (exact vs. fuzzy matching, hash function, collision handling) not publicly documented","No information on deduplication accuracy, false positive/negative rates, or sensitivity to minor text variations","Document-level deduplication may miss near-duplicate content or paraphrased versions","Deduplication hashes are pre-computed; users cannot apply custom deduplication strategies without reprocessing","builder identity is not verified yet","no observed match outcomes yet"],"rank_breakdown":{"adoption":0.7,"quality":0.9,"ecosystem":0.49999999999999994,"match_graph":0.25,"freshness":0.75,"weights":{"adoption":0.3,"quality":0.25,"ecosystem":0.1,"match_graph":0.3,"freshness":0.05}},"observed_outcomes":{"matches":0,"success_rate":0,"avg_confidence":0,"top_intents":[],"last_matched_at":null},"maintenance":{"status":"active","updated_at":"2026-05-24T12:16:25.061Z","last_scraped_at":null,"last_commit":null},"community":{"stars":null,"forks":null,"weekly_downloads":null,"model_downloads":null,"model_likes":null}},"distribution":{"claim_url":"https://unfragile.ai/submit?claim=redpajama-v2","compare_url":"https://unfragile.ai/compare?artifact=redpajama-v2"}},"signature":"qP52wz0cEEx8TF5huzGQH0pvz7CtjxZbkGg3kBjL8YjOgpMl1HNQBKmJz83AmWXBINxYlmWdpayiOFknboXAAw==","signedAt":"2026-06-19T20:21:36.237Z","signedBy":"unfragile.ai","version":1},"_links":{"self":"https://unfragile.ai/api/v1/passport/redpajama-v2","artifact":"https://unfragile.ai/redpajama-v2","verify":"https://unfragile.ai/api/v1/verify?slug=redpajama-v2","publicKey":"https://unfragile.ai/api/v1/trust-passport-public-key","spec":"https://unfragile.ai/trust","schema":"https://unfragile.ai/schema.json","docs":"https://unfragile.ai/docs"}}