{"passport":{"unfragile":{"@version":"1.0","version":"2026-05","artifact":{"id":"papers-with-code-evalplus","slug":"evalplus","name":"EvalPlus","type":"benchmark","url":"https://paperswithcode.com/dataset/evalplus","page_url":"https://unfragile.ai/evalplus","categories":["testing-quality"],"tags":["benchmark","code-generation","evaluation","humaneval"],"pricing":{"model":"open_source","free":true,"starting_price":null},"status":"active","verified":false},"capabilities":[{"id":"papers-with-code-evalplus__cap_0","uri":"capability://code.generation.editing.extended.test.case.generation.for.code.evaluation","name":"extended test case generation for code evaluation","description":"EvalPlus enhances the HumanEval benchmark by providing additional, more challenging test cases for each of the original 164 problems, extending the evaluation scope to over 40 test cases per problem. This is achieved by systematically generating diverse edge cases and complex scenarios that challenge models to demonstrate true coding proficiency rather than simply overfitting to the original tests. The approach focuses on rigorous evaluation, ensuring that models are tested against a broader range of inputs and conditions, which is crucial for assessing their real-world applicability.","intents":["How can I rigorously evaluate my model's coding capabilities?","What benchmark can I use to ensure my AI doesn't overfit to simpler test cases?","I need a comprehensive set of test cases for code generation models."],"best_for":["researchers validating AI code generation models","developers looking for robust evaluation metrics"],"limitations":["Test cases may not cover all possible edge cases, leading to potential gaps in evaluation","Requires significant computational resources for extensive testing"],"requires":["Python 3.7+","Access to the EvalPlus dataset"],"input_types":["code"],"output_types":["structured data","evaluation metrics"],"categories":["code-generation-editing","benchmarking"],"confidence":0.5,"matches":0,"success_rate":0}],"trust":{"score":44,"verified":false,"data_access_risk":"low","permissions":["Python 3.7+","Access to the EvalPlus dataset"],"failure_modes":["Test cases may not cover all possible edge cases, leading to potential gaps in evaluation","Requires significant computational resources for extensive testing","builder identity is not verified yet","no observed match outcomes yet"],"rank_breakdown":{"adoption":0.8,"quality":0.27,"ecosystem":0.42,"match_graph":0.25,"freshness":0.75,"weights":{"adoption":0.25,"quality":0.35,"ecosystem":0.15,"match_graph":0.2,"freshness":0.05}},"observed_outcomes":{"matches":0,"success_rate":0,"avg_confidence":0,"top_intents":[],"last_matched_at":null},"maintenance":{"status":"active","updated_at":"2026-05-24T12:16:25.060Z","last_scraped_at":"2026-05-03T15:20:49.428Z","last_commit":null},"community":{"stars":null,"forks":null,"weekly_downloads":null,"model_downloads":null,"model_likes":null}},"distribution":{"claim_url":"https://unfragile.ai/submit?claim=evalplus","compare_url":"https://unfragile.ai/compare?artifact=evalplus"}},"signature":"pbSkgtXA5GgSQKL6XQewQJalmqzOwrYqQ1uGUDnGoEKpvkAYdWttKTNBSljpF4dxLH7yT3vv/ChuWm2VW+4ACw==","signedAt":"2026-06-22T12:37:20.185Z","signedBy":"unfragile.ai","version":1},"_links":{"self":"https://unfragile.ai/api/v1/passport/evalplus","artifact":"https://unfragile.ai/evalplus","verify":"https://unfragile.ai/api/v1/verify?slug=evalplus","publicKey":"https://unfragile.ai/api/v1/trust-passport-public-key","spec":"https://unfragile.ai/trust","schema":"https://unfragile.ai/schema.json","docs":"https://unfragile.ai/docs"}}