{"passport":{"unfragile":{"@version":"1.0","version":"2026-05","artifact":{"id":"codesearchnet","slug":"codesearchnet","name":"CodeSearchNet","type":"dataset","url":"https://huggingface.co/datasets/code_search_net","page_url":"https://unfragile.ai/codesearchnet","categories":["model-training","testing-quality"],"tags":[],"pricing":{"model":"free","free":true,"starting_price":null},"status":"active","verified":false},"capabilities":[{"id":"codesearchnet__cap_0","uri":"capability://data.processing.analysis.multi.language.code.documentation.pair.extraction.and.indexing","name":"multi-language code-documentation pair extraction and indexing","description":"Extracts 6 million function-docstring pairs from public GitHub repositories across Python, Java, JavaScript, PHP, Ruby, and Go using AST parsing and heuristic matching to align code blocks with their associated natural language documentation. The dataset structures these pairs with metadata (repository, file path, function signature) enabling large-scale supervised training of code understanding models. Implementation uses language-specific parsers to identify function boundaries and docstring conventions (docstrings, JSDoc, Javadoc, etc.) with fuzzy matching to handle inconsistent documentation patterns.","intents":["Train neural code search models that understand semantic relationships between code and natural language queries","Build code understanding models that can generate or predict documentation from source code","Evaluate whether a code search system correctly ranks relevant functions given natural language queries","Create embeddings that map code and documentation into a shared semantic space for retrieval tasks"],"best_for":["ML researchers training code understanding models (CodeBERT, GraphCodeBERT, UniXcoder variants)","Teams building production code search systems who need a standardized benchmark for evaluation","Organizations fine-tuning pre-trained models on domain-specific code repositories"],"limitations":["Dataset is static snapshot from GitHub circa 2019 — does not reflect modern code patterns, frameworks, or language versions","Docstring quality varies significantly; many functions have minimal or auto-generated documentation, introducing noise for training","Extraction heuristics may misalign function boundaries with docstrings in edge cases (nested functions, decorators, complex inheritance)","Skewed language distribution — Python and Java dominate; Ruby and PHP are underrepresented relative to real-world usage","No temporal information — cannot track how code and documentation evolved or diverged over time","Extraction process does not capture context beyond individual functions (imports, class definitions, module-level state)"],"requires":["Hugging Face Datasets library (datasets>=2.0.0)","Python 3.7+","Minimum 50GB disk space for full dataset download (compressed: ~10GB)","Internet connection for initial dataset download from Hugging Face Hub"],"input_types":["GitHub repository metadata (owner, name, commit hash)","Raw source code files in Python, Java, JavaScript, PHP, Ruby, Go"],"output_types":["Structured JSON/Parquet records with fields: code (function source), docstring (natural language), language, repo, path, url","Embeddings (when used with embedding models like CodeBERT)"],"categories":["data-processing-analysis","code-understanding-benchmark"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"codesearchnet__cap_1","uri":"capability://data.processing.analysis.code.search.benchmark.with.relevance.ranking.evaluation","name":"code search benchmark with relevance ranking evaluation","description":"Provides a standardized evaluation protocol where code search systems are scored on their ability to rank relevant functions highly when given natural language queries. The benchmark includes query-function pairs with relevance labels derived from the original docstring-code alignment, enabling metrics like Mean Reciprocal Rank (MRR), Normalized Discounted Cumulative Gain (NDCG), and recall@k. Evaluation is performed by computing similarity between query embeddings and code embeddings, then ranking functions by score and comparing against ground-truth relevant functions.","intents":["Measure how well a code search model ranks relevant functions given a natural language query","Compare performance of different embedding models or retrieval architectures on a standardized benchmark","Identify failure modes in code search systems (e.g., queries that consistently rank irrelevant functions highly)","Track improvements in code search quality as models evolve or are fine-tuned on domain data"],"best_for":["Researchers publishing code search papers who need a reproducible benchmark","Teams evaluating commercial or open-source code search tools (GitHub Copilot, Tabnine, Kite, etc.)","ML engineers tuning retrieval hyperparameters (embedding dimension, similarity metric, re-ranking strategy)"],"limitations":["Relevance labels are binary (relevant/irrelevant) derived from docstring-code pairs; does not capture partial relevance or semantic similarity gradations","Query distribution is synthetic (derived from docstrings) and may not reflect real user search behavior or intent","Benchmark does not evaluate code search on tasks like bug-finding, security vulnerability detection, or code clone detection","No evaluation of latency, throughput, or scalability — purely accuracy-focused metrics","Evaluation assumes single-function retrieval; does not test multi-file or cross-repository code search"],"requires":["Code embedding model (CodeBERT, GraphCodeBERT, or custom trained model)","Query embeddings (either pre-computed or generated on-the-fly)","Evaluation script to compute ranking metrics (provided in dataset repository or custom implementation)","Python 3.7+"],"input_types":["Natural language queries (derived from docstrings or custom queries)","Code embeddings (dense vectors from a code understanding model)","Function embeddings (dense vectors from same model)"],"output_types":["Ranking metrics: MRR, NDCG@k, recall@k, precision@k","Per-query performance breakdown (which queries are hard/easy)","Ranked lists of functions for each query"],"categories":["data-processing-analysis","search-retrieval"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"codesearchnet__cap_2","uri":"capability://data.processing.analysis.language.specific.function.boundary.detection.and.extraction","name":"language-specific function boundary detection and extraction","description":"Implements language-specific AST parsing and heuristic-based extraction to identify function definitions and their associated docstrings across 6 programming languages. For each language, the extraction pipeline uses language-specific conventions: Python (docstrings via triple quotes), Java (Javadoc comments), JavaScript (JSDoc), PHP (PHPDoc), Ruby (YARD/RDoc), and Go (comment blocks). The system handles edge cases like nested functions, decorators, type annotations, and multi-line signatures by leveraging language-specific syntax rules and comment parsing.","intents":["Extract clean function-docstring pairs from raw GitHub repositories for training data preparation","Normalize code across languages into a consistent format for multi-language model training","Handle language-specific documentation conventions (docstrings vs comments) without manual annotation","Scale extraction to millions of functions across diverse codebases with varying code quality and style"],"best_for":["Data engineers building training datasets for code understanding models","Researchers studying code documentation patterns across programming languages","Teams building automated code documentation or code-to-comment generation systems"],"limitations":["Extraction heuristics are imperfect — may miss functions with non-standard documentation or misalign docstrings in complex files","Does not handle code generation or templating (e.g., macros in C, generics in Java) — extracts literal source only","Language-specific parsers may fail on syntax errors or non-standard code patterns (e.g., DSLs embedded in Python)","Nested functions and closures are sometimes incorrectly attributed to parent functions","Extraction does not preserve semantic context (imports, class hierarchy, module state) — functions are extracted in isolation"],"requires":["Language-specific AST parsers (tree-sitter or language-native parsers)","Python 3.7+","Sufficient memory to parse large files (some GitHub files exceed 100KB)"],"input_types":["Raw source code files (.py, .java, .js, .php, .rb, .go)","Repository metadata (file paths, commit hashes)"],"output_types":["Structured records: {code: string, docstring: string, language: string, signature: string, repo: string, path: string}","Parquet or JSON files for downstream processing"],"categories":["data-processing-analysis","code-generation-editing"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"codesearchnet__cap_3","uri":"capability://memory.knowledge.pre.computed.code.and.query.embeddings.for.rapid.model.evaluation","name":"pre-computed code and query embeddings for rapid model evaluation","description":"Provides pre-computed dense vector embeddings for all 6 million functions and associated queries using CodeBERT or similar models, enabling researchers to evaluate new ranking or retrieval strategies without re-embedding the entire dataset. Embeddings are stored in a format optimized for similarity search (e.g., FAISS-compatible vectors), allowing fast nearest-neighbor lookup and ranking without loading the full model. This capability abstracts away the computational cost of embedding generation, making the benchmark accessible to researchers without GPU resources.","intents":["Quickly evaluate new retrieval or ranking algorithms without the overhead of re-embedding 6M functions","Benchmark code search systems on a standardized embedding space, ensuring fair comparison across methods","Enable researchers without GPU access to participate in code search research","Prototype and iterate on code search architectures (e.g., re-ranking, query expansion) in hours rather than days"],"best_for":["Researchers iterating on retrieval algorithms (re-ranking, query expansion, dense-sparse hybrid search)","Teams with limited GPU budgets who need to evaluate multiple models quickly","Practitioners building production code search systems who want to benchmark against a standard"],"limitations":["Pre-computed embeddings are fixed to a specific model (e.g., CodeBERT) — cannot evaluate models with different embedding spaces without re-embedding","Embeddings are static and do not update as code or documentation evolves in real repositories","Storage overhead is significant (~50GB for 6M embeddings at 768 dimensions) — requires substantial disk space","Pre-computed embeddings may not be optimal for domain-specific code (e.g., medical, financial) — fine-tuning on domain data requires re-embedding","Similarity search is limited to the embedding space of the pre-computed model — cannot evaluate semantic search on custom similarity metrics"],"requires":["Disk space for embeddings (~50GB for full dataset)","FAISS or similar vector search library for efficient nearest-neighbor lookup","Python 3.7+","Optional: GPU for faster similarity search (CPU search is feasible but slower)"],"input_types":["Query embeddings (768-dimensional vectors from CodeBERT or compatible model)","Function embeddings (768-dimensional vectors)"],"output_types":["Ranked lists of functions (indices and similarity scores)","Evaluation metrics (MRR, NDCG@k, recall@k)"],"categories":["memory-knowledge","search-retrieval"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"codesearchnet__cap_4","uri":"capability://data.processing.analysis.train.test.split.with.language.stratified.sampling","name":"train-test split with language-stratified sampling","description":"Provides standardized train/test/validation splits of the 6 million function-docstring pairs with stratification by programming language to ensure balanced representation across languages in each split. The split strategy maintains the distribution of languages (Python, Java, JavaScript, PHP, Ruby, Go) across train/test sets, preventing models from overfitting to language-specific patterns or achieving inflated performance on high-resource languages. Splits are deterministic and reproducible, enabling fair comparison across research papers and implementations.","intents":["Train code understanding models on a standardized training set without data leakage or distribution shift","Evaluate models on a held-out test set that reflects the language distribution of the full dataset","Compare results across papers and implementations using identical train/test splits","Analyze model performance per language to identify language-specific strengths and weaknesses"],"best_for":["ML researchers publishing code search papers who need reproducible, standardized splits","Teams training code understanding models who want to avoid data leakage and ensure fair evaluation","Practitioners comparing multiple models or architectures on the same benchmark"],"limitations":["Language-stratified sampling ensures balanced representation but may not reflect real-world code search queries (e.g., Python queries may be more common than Go)","Train/test split is static — does not account for temporal distribution of code (newer code may have different patterns)","No explicit handling of repository-level leakage — functions from the same repository may appear in both train and test sets","Stratification by language only — does not stratify by code complexity, documentation quality, or other factors that may affect model performance","Split sizes are fixed — researchers cannot easily create alternative splits for cross-validation or sensitivity analysis"],"requires":["Hugging Face Datasets library (datasets>=2.0.0)","Python 3.7+","Deterministic random seed for reproducibility"],"input_types":["Full dataset of 6M function-docstring pairs with language labels"],"output_types":["Train split (~80% of data, ~4.8M pairs)","Test split (~10% of data, ~600K pairs)","Validation split (~10% of data, ~600K pairs)","Per-language split statistics (e.g., Python: 40% of train, 40% of test)"],"categories":["data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"codesearchnet__cap_5","uri":"capability://data.processing.analysis.github.repository.metadata.and.provenance.tracking","name":"github repository metadata and provenance tracking","description":"Includes rich metadata for each function-docstring pair: repository owner, repository name, file path, commit hash, and GitHub URL. This metadata enables researchers to trace extracted functions back to their original source, verify data quality, and analyze code search performance by repository characteristics (e.g., popularity, age, language). The provenance information supports reproducibility and allows researchers to filter or analyze subsets of the dataset based on repository properties (e.g., only functions from popular repositories, or only recent commits).","intents":["Verify data quality by inspecting original functions in their GitHub context","Analyze code search performance by repository characteristics (e.g., does the model perform better on popular repositories?)","Filter the dataset to focus on specific repositories, languages, or time periods","Reproduce the dataset extraction process or audit the extraction pipeline for errors"],"best_for":["Researchers auditing dataset quality or analyzing model performance by data source","Teams building domain-specific code search systems who want to filter to relevant repositories","Data scientists analyzing biases in code understanding models (e.g., do models perform better on popular repositories?)"],"limitations":["Metadata is static — does not reflect current state of repositories (code may have changed since extraction)","GitHub URLs may become invalid if repositories are deleted or made private","Commit hashes enable reproducibility but require access to GitHub API or local clones to verify","Metadata does not include repository metadata like stars, forks, or activity level — requires additional GitHub API calls","No information about code licensing or usage rights — researchers must verify licensing independently"],"requires":["GitHub API access (optional, for verifying metadata or fetching additional repository information)","Python 3.7+"],"input_types":["Function-docstring pairs with associated metadata"],"output_types":["Structured metadata: {repo_owner: string, repo_name: string, file_path: string, commit_hash: string, github_url: string}","Filtered subsets of the dataset based on metadata queries"],"categories":["data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"codesearchnet__cap_6","uri":"capability://data.processing.analysis.multi.language.code.normalization.and.standardization","name":"multi-language code normalization and standardization","description":"Applies language-specific normalization rules to code snippets to improve consistency and reduce noise: removing comments (except docstrings), normalizing whitespace, standardizing identifier names, and handling language-specific syntax variations. The normalization is applied consistently across all 6 languages using language-specific rules (e.g., Python indentation, Java access modifiers, JavaScript semicolons), enabling models to focus on semantic patterns rather than syntactic variations. Normalization is optional and can be disabled for use cases requiring original code.","intents":["Reduce noise and improve model training by normalizing code syntax across different coding styles","Enable models to learn semantic code patterns rather than syntactic variations","Improve generalization by reducing the vocabulary of code tokens (e.g., treating different identifier names as equivalent)","Prepare code for downstream tasks like code clone detection or code search where syntactic variations should not affect similarity"],"best_for":["ML researchers training code understanding models who want to reduce syntactic noise","Teams building code search systems where syntactic variations should not affect relevance ranking","Practitioners fine-tuning models on domain-specific code with inconsistent formatting"],"limitations":["Normalization may remove important semantic information (e.g., comments that explain intent, whitespace that indicates code structure)","Language-specific normalization rules are heuristic-based and may not handle all edge cases or non-standard code patterns","Normalized code may be less readable for human inspection or debugging","Normalization is lossy — cannot recover original code from normalized version","Different normalization strategies may be optimal for different downstream tasks (e.g., code search vs code clone detection)"],"requires":["Language-specific code formatters or AST-based normalization tools","Python 3.7+"],"input_types":["Raw source code in Python, Java, JavaScript, PHP, Ruby, Go"],"output_types":["Normalized code with consistent formatting, whitespace, and identifier naming","Optional: mapping from normalized code back to original code for inspection"],"categories":["data-processing-analysis","code-generation-editing"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"codesearchnet__cap_7","uri":"capability://data.processing.analysis.multi.language.code.tokenization.and.vocabulary","name":"multi-language code tokenization and vocabulary","description":"Provides language-aware tokenization and shared vocabulary for code across 6 programming languages. Tokenization handles language-specific syntax (operators, keywords, delimiters) while creating a unified vocabulary that maps tokens from different languages to shared semantic categories. This enables models to process code from any supported language using a single tokenizer and vocabulary, reducing model complexity and enabling cross-language transfer.","intents":["Tokenize code from multiple languages using a single, unified vocabulary","Enable models to process code from different languages without language-specific preprocessing","Reduce vocabulary size and model complexity by sharing tokens across languages"],"best_for":["ML researchers developing polyglot code models that process multiple languages","Teams implementing code understanding systems that need to support multiple languages","Organizations building code search engines with language-agnostic tokenization"],"limitations":["Shared vocabulary may lose language-specific semantic information — some tokens may have different meanings in different languages","Tokenization quality varies by language — less common languages may have suboptimal tokenization","Vocabulary size is larger than single-language vocabularies due to need to cover all 6 languages","No explicit handling of language-specific keywords or built-in functions — relies on generic tokenization"],"requires":["Code samples from all 6 supported languages","Tokenizer implementation supporting language-specific syntax (e.g., tree-sitter-based or regex-based)","Vocabulary building algorithm (BPE, WordPiece, or SentencePiece)"],"input_types":["Code snippets in any of 6 supported languages","Language identifier or implicit language detection"],"output_types":["Tokenized code (token IDs)","Shared vocabulary mapping tokens to IDs","Token embeddings for downstream models"],"categories":["data-processing-analysis","code-generation-editing"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"codesearchnet__headline","uri":"capability://data.processing.analysis.benchmark.dataset.for.code.search","name":"benchmark dataset for code search","description":"A comprehensive benchmark dataset for code search containing 6 million functions across 6 programming languages, paired with natural language documentation, ideal for training and evaluating code understanding models.","intents":["best code search dataset","code search dataset for training models","benchmark for code search evaluation","dataset for code understanding research","free dataset for programming languages"],"best_for":["researchers","developers","data scientists"],"limitations":[],"requires":[],"input_types":[],"output_types":[],"categories":["data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0}],"trust":{"score":57,"verified":false,"data_access_risk":"high","permissions":["Hugging Face Datasets library (datasets>=2.0.0)","Python 3.7+","Minimum 50GB disk space for full dataset download (compressed: ~10GB)","Internet connection for initial dataset download from Hugging Face Hub","Code embedding model (CodeBERT, GraphCodeBERT, or custom trained model)","Query embeddings (either pre-computed or generated on-the-fly)","Evaluation script to compute ranking metrics (provided in dataset repository or custom implementation)","Language-specific AST parsers (tree-sitter or language-native parsers)","Sufficient memory to parse large files (some GitHub files exceed 100KB)","Disk space for embeddings (~50GB for full dataset)"],"failure_modes":["Dataset is static snapshot from GitHub circa 2019 — does not reflect modern code patterns, frameworks, or language versions","Docstring quality varies significantly; many functions have minimal or auto-generated documentation, introducing noise for training","Extraction heuristics may misalign function boundaries with docstrings in edge cases (nested functions, decorators, complex inheritance)","Skewed language distribution — Python and Java dominate; Ruby and PHP are underrepresented relative to real-world usage","No temporal information — cannot track how code and documentation evolved or diverged over time","Extraction process does not capture context beyond individual functions (imports, class definitions, module-level state)","Relevance labels are binary (relevant/irrelevant) derived from docstring-code pairs; does not capture partial relevance or semantic similarity gradations","Query distribution is synthetic (derived from docstrings) and may not reflect real user search behavior or intent","Benchmark does not evaluate code search on tasks like bug-finding, security vulnerability detection, or code clone detection","No evaluation of latency, throughput, or scalability — purely accuracy-focused metrics","builder identity is not verified yet","no observed match outcomes yet"],"rank_breakdown":{"adoption":0.7,"quality":0.8500000000000001,"ecosystem":0.39999999999999997,"match_graph":0.25,"freshness":0.75,"weights":{"adoption":0.3,"quality":0.25,"ecosystem":0.1,"match_graph":0.3,"freshness":0.05}},"observed_outcomes":{"matches":0,"success_rate":0,"avg_confidence":0,"top_intents":[],"last_matched_at":null},"maintenance":{"status":"active","updated_at":"2026-05-24T12:16:21.547Z","last_scraped_at":null,"last_commit":null},"community":{"stars":null,"forks":null,"weekly_downloads":null,"model_downloads":null,"model_likes":null}},"distribution":{"claim_url":"https://unfragile.ai/submit?claim=codesearchnet","compare_url":"https://unfragile.ai/compare?artifact=codesearchnet"}},"signature":"WtTnGVG0fTVZP+PgqAbLeOigOH+jB/H2EX6K2E+Zv5S14OAhHvK3vGjGf1Im1RI6bTFhwBF3PU8YFv8ah4lwBA==","signedAt":"2026-06-21T07:10:17.051Z","signedBy":"unfragile.ai","version":1},"_links":{"self":"https://unfragile.ai/api/v1/passport/codesearchnet","artifact":"https://unfragile.ai/codesearchnet","verify":"https://unfragile.ai/api/v1/verify?slug=codesearchnet","publicKey":"https://unfragile.ai/api/v1/trust-passport-public-key","spec":"https://unfragile.ai/trust","schema":"https://unfragile.ai/schema.json","docs":"https://unfragile.ai/docs"}}