{"passport":{"unfragile":{"@version":"1.0","version":"2026-05","artifact":{"id":"hf-dataset-hf-doc-build--doc-build-dev","slug":"hf-doc-build--doc-build-dev","name":"doc-build-dev","type":"dataset","url":"https://huggingface.co/datasets/hf-doc-build/doc-build-dev","page_url":"https://unfragile.ai/hf-doc-build--doc-build-dev","categories":["model-training"],"tags":["license:mit","region:us","documentation"],"pricing":{"model":"open_source","free":true,"starting_price":null},"status":"active","verified":false},"capabilities":[{"id":"hf-dataset-hf-doc-build--doc-build-dev__cap_0","uri":"capability://data.processing.analysis.documentation.corpus.fine.tuning.dataset.construction","name":"documentation-corpus fine-tuning dataset construction","description":"Provides a curated dataset of 271,754 documentation examples extracted from HuggingFace ecosystem repositories, structured for training language models on technical documentation generation and understanding. The dataset captures real-world documentation patterns, code examples, and API reference structures from production documentation builds, enabling models to learn documentation conventions, formatting, and technical accuracy patterns specific to ML/AI frameworks.","intents":["Train a language model to generate accurate technical documentation for ML libraries","Fine-tune models to understand and summarize API documentation patterns","Build documentation generation systems that match HuggingFace ecosystem conventions","Create models specialized in technical writing for open-source projects"],"best_for":["ML researchers training domain-specific documentation models","Teams building automated documentation generation systems","Open-source maintainers creating documentation assistants","Companies fine-tuning models for technical content generation"],"limitations":["Dataset is HuggingFace-ecosystem-specific; may not generalize to non-ML documentation domains","No version control history or temporal metadata; captures static documentation snapshots","Unknown filtering criteria for documentation quality; may include outdated or deprecated API references","No explicit train/validation/test splits provided; requires manual partitioning for model evaluation","Limited to English documentation; no multilingual variants"],"requires":["HuggingFace Datasets library (datasets>=2.0.0)","Python 3.7+","Sufficient disk space (~2-5GB depending on caching strategy)","Internet connection for initial dataset download from HuggingFace Hub"],"input_types":["documentation markdown/RST files","API reference documentation","code examples embedded in docs","structured metadata from documentation builds"],"output_types":["tokenized training examples","text sequences with documentation context","structured documentation-code pairs","preprocessed dataset splits for model training"],"categories":["data-processing-analysis","model-training-datasets"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-dataset-hf-doc-build--doc-build-dev__cap_1","uri":"capability://data.processing.analysis.documentation.code.example.pair.extraction","name":"documentation-code example pair extraction","description":"Extracts aligned pairs of documentation text and code examples from the dataset, preserving semantic relationships between explanatory prose and implementation snippets. Uses structured parsing to identify code blocks within documentation, associate them with surrounding context, and maintain bidirectional references between documentation sections and their corresponding code examples.","intents":["Create training data for code-documentation alignment models","Build systems that generate code examples from documentation descriptions","Train models to generate documentation from code snippets","Develop documentation-aware code completion systems"],"best_for":["Researchers training code-documentation alignment models","Teams building documentation-to-code or code-to-documentation systems","Developers creating intelligent code example retrieval systems","ML engineers training multimodal documentation understanding models"],"limitations":["Extraction quality depends on documentation structure consistency; poorly formatted docs may not parse correctly","Code examples may be incomplete snippets rather than runnable code; no validation of syntactic correctness","Language detection for code blocks may fail on polyglot examples or pseudocode","No explicit semantic alignment scoring; requires downstream validation of code-doc pair quality"],"requires":["HuggingFace Datasets library","Python 3.7+","Markdown/RST parsing libraries (e.g., markdown, docutils)","Optional: AST parsing libraries for code validation (ast, tree-sitter)"],"input_types":["markdown documentation with code fences","RST documentation with code blocks","inline code snippets","documentation metadata"],"output_types":["documentation-code example pairs","structured JSON with doc context and code","aligned text-code sequences for training","code block metadata (language, line numbers)"],"categories":["data-processing-analysis","code-generation-editing"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-dataset-hf-doc-build--doc-build-dev__cap_2","uri":"capability://data.processing.analysis.documentation.build.artifact.dataset.versioning","name":"documentation-build artifact dataset versioning","description":"Maintains snapshots of documentation as generated by HuggingFace's build pipeline, capturing the exact state of rendered documentation at specific points in time. The dataset includes build metadata, timestamps, and source repository references, enabling reproducible access to historical documentation states and tracking how documentation evolves across versions.","intents":["Analyze how documentation changes across library versions","Train models on documentation from specific release cycles","Build systems that understand documentation evolution patterns","Create reproducible documentation analysis pipelines"],"best_for":["Researchers studying documentation quality evolution","Teams analyzing breaking changes in API documentation","Documentation systems that need version-aware retrieval","ML engineers training models on specific documentation snapshots"],"limitations":["Dataset captures only HuggingFace build outputs; no access to source repository history or commits","Temporal granularity unknown; may not have documentation for every release or commit","No explicit version tags or semantic versioning information; requires external mapping to library versions","Build metadata may be incomplete or inconsistent across different documentation sources"],"requires":["HuggingFace Datasets library","Python 3.7+","Understanding of HuggingFace documentation build structure","Optional: git or version control knowledge for correlating with source repositories"],"input_types":["documentation build artifacts","build metadata and timestamps","source repository references","version identifiers"],"output_types":["versioned documentation snapshots","build metadata with timestamps","documentation diffs across versions","version-tagged training examples"],"categories":["data-processing-analysis","memory-knowledge"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-dataset-hf-doc-build--doc-build-dev__cap_3","uri":"capability://data.processing.analysis.multi.framework.documentation.pattern.learning","name":"multi-framework documentation pattern learning","description":"Aggregates documentation from multiple HuggingFace ecosystem libraries (transformers, datasets, diffusers, etc.) into a unified dataset, enabling models to learn common documentation patterns, conventions, and terminology across different frameworks. The dataset structure preserves framework-specific metadata while allowing cross-framework pattern extraction and generalization.","intents":["Train models to understand and generate documentation following HuggingFace conventions","Build documentation style transfer systems that adapt docs to HuggingFace standards","Create framework-agnostic documentation understanding models","Develop systems that identify and enforce documentation consistency across projects"],"best_for":["Teams maintaining multiple open-source ML libraries with consistent documentation","Researchers studying documentation patterns across ML frameworks","Documentation automation systems targeting HuggingFace ecosystem","ML engineers training models on multi-domain technical documentation"],"limitations":["Documentation patterns may be HuggingFace-specific and not generalizable to other ecosystems","Framework-specific terminology and conventions embedded in dataset; requires careful handling for transfer learning","No explicit framework labels or metadata for filtering; requires external mapping to identify framework-specific examples","Inconsistencies in documentation quality and style across different libraries may introduce noise"],"requires":["HuggingFace Datasets library","Python 3.7+","Knowledge of HuggingFace ecosystem structure and conventions","Optional: framework-specific parsing libraries for extracting framework-specific patterns"],"input_types":["documentation from transformers library","documentation from datasets library","documentation from diffusers library","documentation from other HuggingFace ecosystem projects","framework metadata and identifiers"],"output_types":["unified documentation examples","framework-tagged training data","cross-framework pattern annotations","documentation style embeddings"],"categories":["data-processing-analysis","memory-knowledge"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-dataset-hf-doc-build--doc-build-dev__cap_4","uri":"capability://data.processing.analysis.documentation.to.api.schema.mapping","name":"documentation-to-api-schema mapping","description":"Correlates documentation text with underlying API schemas, function signatures, and parameter definitions extracted from source code or API specifications. The dataset maintains bidirectional mappings between documentation sections and their corresponding API elements, enabling models to learn how natural language documentation relates to formal API specifications and type information.","intents":["Train models to generate API documentation from function signatures","Build systems that validate documentation against actual API implementations","Create documentation-aware API exploration and discovery tools","Train models to understand parameter descriptions and type constraints from documentation"],"best_for":["Developers building API documentation generation systems","Teams creating documentation validation and linting tools","Researchers training models on API understanding and documentation","ML engineers building intelligent API documentation assistants"],"limitations":["API schema extraction quality depends on source code structure and documentation completeness","Mapping between documentation and API elements may be ambiguous or incomplete for complex APIs","No validation that documentation accurately reflects actual API behavior; may contain outdated or incorrect descriptions","Type information may be incomplete or missing for dynamically-typed code"],"requires":["HuggingFace Datasets library","Python 3.7+","AST parsing libraries (ast, tree-sitter) for extracting API schemas","Optional: type annotation parsing libraries (typing_inspect, typeshed)"],"input_types":["documentation text","API function signatures","parameter descriptions","type annotations","API schema definitions"],"output_types":["documentation-API mappings","parameter-description pairs","type-annotated documentation examples","API schema with documentation context"],"categories":["data-processing-analysis","code-generation-editing"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-dataset-hf-doc-build--doc-build-dev__cap_5","uri":"capability://search.retrieval.documentation.search.and.retrieval.indexing","name":"documentation search and retrieval indexing","description":"Provides pre-indexed documentation corpus optimized for semantic search and retrieval tasks, with embeddings or dense vector representations of documentation sections. The dataset includes document boundaries, section hierarchies, and metadata enabling efficient retrieval of relevant documentation given queries or code context.","intents":["Build documentation search systems that understand semantic meaning rather than keyword matching","Create context-aware code completion systems that retrieve relevant documentation","Train retrieval-augmented generation systems for documentation-based question answering","Develop documentation recommendation systems for developers"],"best_for":["Teams building documentation search and discovery tools","Developers creating RAG systems for documentation question-answering","ML engineers training retrieval models for technical content","Companies building intelligent documentation assistants"],"limitations":["Pre-computed embeddings may use specific embedding models; may not be optimal for all downstream tasks","Document chunking strategy is fixed; may not align with optimal retrieval granularity for all use cases","No explicit relevance judgments or ground truth for retrieval evaluation","Embedding space may not capture domain-specific semantic relationships effectively"],"requires":["HuggingFace Datasets library","Python 3.7+","Vector database or similarity search library (faiss, annoy, or similar)","Optional: embedding model for computing or validating embeddings"],"input_types":["documentation text","document metadata","section hierarchies","pre-computed embeddings"],"output_types":["indexed documentation corpus","dense vector representations","retrieval results with relevance scores","document metadata and boundaries"],"categories":["search-retrieval","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0}],"trust":{"score":22,"verified":false,"data_access_risk":"high","permissions":["HuggingFace Datasets library (datasets>=2.0.0)","Python 3.7+","Sufficient disk space (~2-5GB depending on caching strategy)","Internet connection for initial dataset download from HuggingFace Hub","HuggingFace Datasets library","Markdown/RST parsing libraries (e.g., markdown, docutils)","Optional: AST parsing libraries for code validation (ast, tree-sitter)","Understanding of HuggingFace documentation build structure","Optional: git or version control knowledge for correlating with source repositories","Knowledge of HuggingFace ecosystem structure and conventions"],"failure_modes":["Dataset is HuggingFace-ecosystem-specific; may not generalize to non-ML documentation domains","No version control history or temporal metadata; captures static documentation snapshots","Unknown filtering criteria for documentation quality; may include outdated or deprecated API references","No explicit train/validation/test splits provided; requires manual partitioning for model evaluation","Limited to English documentation; no multilingual variants","Extraction quality depends on documentation structure consistency; poorly formatted docs may not parse correctly","Code examples may be incomplete snippets rather than runnable code; no validation of syntactic correctness","Language detection for code blocks may fail on polyglot examples or pseudocode","No explicit semantic alignment scoring; requires downstream validation of code-doc pair quality","Dataset captures only HuggingFace build outputs; no access to source repository history or commits","builder identity is not verified yet","no observed match outcomes yet"],"rank_breakdown":{"adoption":0.05,"quality":0.22,"ecosystem":0.38999999999999996,"match_graph":0.25,"freshness":0.75,"weights":{"adoption":0.3,"quality":0.25,"ecosystem":0.1,"match_graph":0.3,"freshness":0.05}},"observed_outcomes":{"matches":0,"success_rate":0,"avg_confidence":0,"top_intents":[],"last_matched_at":null},"maintenance":{"status":"active","updated_at":"2026-05-24T12:16:22.764Z","last_scraped_at":"2026-05-03T14:22:48.064Z","last_commit":null},"community":{"stars":null,"forks":null,"weekly_downloads":null,"model_downloads":null,"model_likes":null}},"distribution":{"claim_url":"https://unfragile.ai/submit?claim=hf-doc-build--doc-build-dev","compare_url":"https://unfragile.ai/compare?artifact=hf-doc-build--doc-build-dev"}},"signature":"gXyVEK4Hf6px0OHoF+6GZ2XU39+Ma63n8PE5Te6kKe5ZVyLySsvOAWUJ9HS0qpPRliGIzu7uZbgEwTSWKKBlDg==","signedAt":"2026-06-22T06:36:00.409Z","signedBy":"unfragile.ai","version":1},"_links":{"self":"https://unfragile.ai/api/v1/passport/hf-doc-build--doc-build-dev","artifact":"https://unfragile.ai/hf-doc-build--doc-build-dev","verify":"https://unfragile.ai/api/v1/verify?slug=hf-doc-build--doc-build-dev","publicKey":"https://unfragile.ai/api/v1/trust-passport-public-key","spec":"https://unfragile.ai/trust","schema":"https://unfragile.ai/schema.json","docs":"https://unfragile.ai/docs"}}