{"passport":{"unfragile":{"@version":"1.0","version":"2026-05","artifact":{"id":"papers-with-code-swe-bench","slug":"swe-bench","name":"SWE-bench","type":"benchmark","url":"https://paperswithcode.com/dataset/swe-bench","page_url":"https://unfragile.ai/swe-bench","categories":["testing-quality","deployment-infra"],"tags":["benchmark","code-generation","evaluation","software-engineering","github"],"pricing":{"model":"open_source","free":true,"starting_price":null},"status":"active","verified":false},"capabilities":[{"id":"papers-with-code-swe-bench__cap_0","uri":"capability://code.generation.editing.real.world.bug.detection.evaluation","name":"real-world bug detection evaluation","description":"SWE-bench evaluates AI systems by testing their ability to locate bugs in real-world codebases sourced from GitHub issues. It utilizes a dataset of actual software engineering tasks, which allows for more realistic assessments compared to synthetic benchmarks like HumanEval. The evaluation framework is designed to simulate real-world scenarios, ensuring that models are tested against practical challenges faced by developers.","intents":["How can I assess my AI model's ability to find bugs in real code?","What benchmark can I use to evaluate bug detection capabilities in AI?","I need a realistic dataset to test my autonomous coding agent."],"best_for":["AI researchers developing bug detection models","developers testing autonomous coding agents"],"limitations":["Limited to the scope of tasks available in the dataset, which may not cover all programming languages or frameworks","Dependent on the quality and variety of the GitHub issues included"],"requires":["Python 3.7+","Access to the SWE-bench dataset"],"input_types":["code"],"output_types":["structured data","evaluation metrics"],"categories":["code-generation-editing","benchmarking"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"papers-with-code-swe-bench__cap_1","uri":"capability://code.generation.editing.automated.fix.writing.evaluation","name":"automated fix writing evaluation","description":"This capability assesses the ability of AI models to generate fixes for identified bugs within real codebases. SWE-bench evaluates how well models can not only detect issues but also propose appropriate code modifications. The evaluation framework includes a variety of bug types and contexts, ensuring that the models are tested against a wide range of scenarios that developers encounter in practice.","intents":["How can I evaluate my AI's ability to write code fixes?","What metrics should I use to assess fix generation in my model?","I want to benchmark my autonomous coding agent's fix writing capabilities."],"best_for":["developers creating AI-assisted coding tools","researchers focused on code generation"],"limitations":["Evaluation may not cover all programming languages or frameworks present in the dataset","Fixes generated may require manual review for correctness"],"requires":["Python 3.7+","Access to the SWE-bench dataset"],"input_types":["code","bug reports"],"output_types":["structured data","evaluation metrics"],"categories":["code-generation-editing","benchmarking"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"papers-with-code-swe-bench__cap_2","uri":"capability://code.generation.editing.test.suite.passing.evaluation","name":"test suite passing evaluation","description":"SWE-bench evaluates whether AI-generated fixes can pass existing test suites in real codebases. This capability ensures that the proposed solutions not only address the bugs but also maintain the integrity of the software by passing all relevant tests. The evaluation framework integrates with various testing frameworks to verify that the code modifications do not introduce new issues.","intents":["How can I test if my AI-generated fixes are reliable?","What benchmark can I use to ensure my model's fixes pass existing tests?","I need to validate the correctness of code changes made by my AI."],"best_for":["QA engineers testing AI-generated code","developers ensuring code quality"],"limitations":["Dependent on the availability and comprehensiveness of the test suites in the dataset","May not cover all edge cases present in real-world applications"],"requires":["Python 3.7+","Access to the SWE-bench dataset"],"input_types":["code","test cases"],"output_types":["structured data","pass/fail metrics"],"categories":["code-generation-editing","benchmarking"],"confidence":0.5,"matches":0,"success_rate":0}],"trust":{"score":51,"verified":false,"data_access_risk":"high","permissions":["Python 3.7+","Access to the SWE-bench dataset"],"failure_modes":["Limited to the scope of tasks available in the dataset, which may not cover all programming languages or frameworks","Dependent on the quality and variety of the GitHub issues included","Evaluation may not cover all programming languages or frameworks present in the dataset","Fixes generated may require manual review for correctness","Dependent on the availability and comprehensiveness of the test suites in the dataset","May not cover all edge cases present in real-world applications","builder identity is not verified yet","no observed match outcomes yet"],"rank_breakdown":{"adoption":0.8,"quality":0.41,"ecosystem":0.55,"match_graph":0.25,"freshness":0.75,"weights":{"adoption":0.25,"quality":0.35,"ecosystem":0.15,"match_graph":0.2,"freshness":0.05}},"observed_outcomes":{"matches":0,"success_rate":0,"avg_confidence":0,"top_intents":[],"last_matched_at":null},"maintenance":{"status":"active","updated_at":"2026-05-24T12:16:25.060Z","last_scraped_at":"2026-05-03T15:20:49.428Z","last_commit":null},"community":{"stars":null,"forks":null,"weekly_downloads":null,"model_downloads":null,"model_likes":null}},"distribution":{"claim_url":"https://unfragile.ai/submit?claim=swe-bench","compare_url":"https://unfragile.ai/compare?artifact=swe-bench"}},"signature":"bQ7GLoYkvJ1TI52iOfktqLUPvyWpGzJzfEMFZw2Dwu4M1XXCHXD0m9AGIOphjN1w8H5gqFLPJwLn/fCBwft4AA==","signedAt":"2026-06-22T15:54:08.159Z","signedBy":"unfragile.ai","version":1},"_links":{"self":"https://unfragile.ai/api/v1/passport/swe-bench","artifact":"https://unfragile.ai/swe-bench","verify":"https://unfragile.ai/api/v1/verify?slug=swe-bench","publicKey":"https://unfragile.ai/api/v1/trust-passport-public-key","spec":"https://unfragile.ai/trust","schema":"https://unfragile.ai/schema.json","docs":"https://unfragile.ai/docs"}}