{"passport":{"unfragile":{"@version":"1.0","version":"2026-05","artifact":{"id":"github-thudm--agentbench","slug":"thudm--agentbench","name":"AgentBench","type":"benchmark","url":"https://github.com/THUDM/AgentBench","page_url":"https://unfragile.ai/thudm--agentbench","categories":["testing-quality"],"tags":["chatgpt","gpt-4","llm","llm-agent"],"pricing":{"model":"open_source","free":true,"starting_price":null},"status":"active","verified":false},"capabilities":[{"id":"github-thudm--agentbench__cap_0","uri":"capability://planning.reasoning.multi.environment.llm.agent.evaluation.across.8.standardized.task.domains","name":"multi-environment llm agent evaluation across 8 standardized task domains","description":"Evaluates LLMs as autonomous agents across 8 distinct environments (OS, DB, KG, DCG, LTP, HH, WS, WB) using a standardized Task Interface that defines sample retrieval, execution, and metric calculation. The framework abstracts environment-specific logic behind a common contract, enabling systematic comparison of agent performance across heterogeneous task types with environment-specific startup times (5s-5min) and resource requirements (500MB-15GB). Agents interact with tasks through multi-turn Session management that tracks conversation history and message exchange.","intents":["Compare how different LLMs (GPT-4, ChatGPT, open-source models) perform when operating as autonomous agents","Benchmark agent capabilities across diverse task categories (command-line, database, knowledge graphs, games, web interaction)","Measure agent performance using environment-specific metrics rather than generic accuracy scores","Evaluate both proprietary and open-source LLMs under controlled, reproducible conditions"],"best_for":["LLM researchers evaluating agent capabilities across diverse domains","Teams comparing proprietary vs open-source LLM performance as agents","Organizations building agent systems and needing standardized benchmarks"],"limitations":["Web Shopping and Web Browsing environments require 15GB and 1GB respectively, limiting local evaluation","Startup times vary significantly (5s-5min), making batch evaluation of many samples time-intensive","Environment-specific metrics are not directly comparable across domains, requiring separate analysis per task type","No built-in support for custom evaluation metrics beyond environment-provided ones"],"requires":["Python 3.8+","API keys for proprietary LLMs (OpenAI, Anthropic) or local LLM deployment","15GB+ disk space for Web Shopping environment","Linux environment for OS command-line task execution"],"input_types":["LLM model identifiers (string)","Task environment names (string)","Agent configuration (JSON/YAML)","Sample indices (integer)"],"output_types":["Performance metrics (float/dict)","Agent action traces (structured logs)","Success/failure indicators (boolean)","Conversation history (message sequences)"],"categories":["planning-reasoning","testing-quality","benchmark-evaluation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"github-thudm--agentbench__cap_1","uri":"capability://tool.use.integration.standardized.task.interface.for.defining.benchmark.environments","name":"standardized task interface for defining benchmark environments","description":"Provides a contract-based Task interface that all benchmark environments implement, defining methods for retrieving sample indices, executing individual samples with agent interactions, and calculating overall performance metrics. The interface abstracts environment-specific logic (game engines, database systems, web simulators) behind common method signatures, enabling the framework to orchestrate agent evaluation without coupling to particular environment implementations. Each task environment implements sample retrieval, step-by-step execution with agent actions, and metric aggregation.","intents":["Define new benchmark tasks without modifying core framework code","Ensure all environments expose consistent APIs for agent interaction","Enable task-agnostic agent evaluation logic that works across all environments","Calculate performance metrics in a standardized way across heterogeneous domains"],"best_for":["Researchers extending AgentBench with custom task environments","Framework maintainers ensuring consistency across 8+ task implementations","Teams building domain-specific agent benchmarks using AgentBench patterns"],"limitations":["Interface abstraction may hide important environment-specific details, requiring documentation per task","Metric calculation must be implemented per-environment, preventing cross-environment metric comparison","No built-in versioning for task definitions, making task evolution tracking difficult","Sample indices are integer-based, limiting support for hierarchical or named sample organization"],"requires":["Python 3.8+","Understanding of AgentBench Task interface contract","Implementation of required methods: get_indices(), execute(), get_metrics()"],"input_types":["Sample index (integer)","Agent action (string/structured)","Task configuration (dict)"],"output_types":["Task state (dict/object)","Observation for agent (string/structured)","Metrics (float/dict)","Done flag (boolean)"],"categories":["tool-use-integration","automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"github-thudm--agentbench__cap_10","uri":"capability://tool.use.integration.web.shopping.task.environment.with.e.commerce.interaction.simulation","name":"web shopping task environment with e-commerce interaction simulation","description":"Provides a web shopping task environment where agents interact with a simulated e-commerce platform to complete shopping tasks (product search, comparison, purchase). Agents navigate product catalogs, read descriptions and reviews, manage shopping carts, and complete transactions through a web interface. The environment simulates realistic e-commerce workflows with product filtering, price comparison, and checkout processes. Tasks evaluate agent capabilities in information seeking, decision-making under uncertainty, and multi-step task completion in a complex web environment (~15GB resource requirement).","intents":["Evaluate LLM agents' ability to navigate and complete tasks in e-commerce environments","Test agent capabilities in product search, comparison, and decision-making","Measure agent performance in multi-step web-based task completion","Analyze agent information-seeking strategies and purchase decision patterns"],"best_for":["Researchers evaluating LLM agents' web navigation and decision-making capabilities","Teams testing agent e-commerce task completion and information seeking","Developers analyzing agent product comparison and purchase reasoning"],"limitations":["Web shopping environment requires ~15GB disk space, limiting local evaluation","Simulated e-commerce may not capture all real-world complexity and edge cases","Agent performance depends on product catalog design and task specification","No support for dynamic pricing or inventory changes during evaluation"],"requires":["Python 3.8+","15GB+ disk space for web shopping environment","Simulated e-commerce platform (WebShop or similar)","Product catalog and pricing data"],"input_types":["Task description (string: product to find, budget, preferences)","Web page content (HTML/text)","Product information (name, price, reviews, specifications)","Shopping cart state (list of items)"],"output_types":["Web action (click, search, add to cart, checkout)","Search query (string)","Product selection (product ID)","Purchase confirmation (boolean)","Performance metrics (task success, efficiency, cost optimization)"],"categories":["tool-use-integration","automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"github-thudm--agentbench__cap_11","uri":"capability://search.retrieval.web.browsing.task.environment.with.multi.page.navigation.and.information.retrieval","name":"web browsing task environment with multi-page navigation and information retrieval","description":"Provides a web browsing task environment where agents navigate websites to find information and complete web-based tasks. Agents interact with a simulated web browser, following links, reading page content, and performing searches to locate specific information. The environment simulates realistic web navigation with multiple pages, search results, and information density variations. Tasks evaluate agent capabilities in web navigation, information retrieval, and multi-step task completion in open-ended web environments (~1GB resource requirement, ~5min startup).","intents":["Evaluate LLM agents' ability to navigate websites and retrieve information","Test agent capabilities in web search and information location tasks","Measure agent performance in multi-step web-based information retrieval","Analyze agent web navigation strategies and information-seeking patterns"],"best_for":["Researchers evaluating LLM agents' web navigation and information retrieval capabilities","Teams testing agent web search and information location performance","Developers analyzing agent navigation strategies and search query composition"],"limitations":["Web browsing environment requires ~1GB disk space and ~5min startup time","Simulated web pages may not capture all real-world HTML complexity","Agent performance depends on page layout and information organization","No support for JavaScript-heavy pages or dynamic content loading"],"requires":["Python 3.8+","1GB+ disk space for web browsing environment","Simulated web environment (Mind2Web or similar)","Web page corpus and navigation graph"],"input_types":["Task description (string: information to find, question to answer)","Web page content (HTML/text)","Search results (list of pages)","Navigation history (list of visited pages)"],"output_types":["Web action (click link, search, scroll, go back)","Search query (string)","Retrieved information (string/structured)","Task completion status (boolean)","Performance metrics (information retrieval accuracy, navigation efficiency)"],"categories":["search-retrieval","tool-use-integration"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"github-thudm--agentbench__cap_12","uri":"capability://planning.reasoning.household.task.environment.with.interactive.home.simulation.alfworld.based","name":"household task environment with interactive home simulation (alfworld-based)","description":"Provides a household task environment where agents complete domestic tasks in a simulated home environment (based on ALFWorld). Agents interact with a text-based or visual home simulator, manipulating objects, navigating rooms, and completing household chores (cooking, cleaning, organizing). The environment simulates realistic household physics and object interactions, requiring agents to reason about spatial relationships, object properties, and task decomposition. Tasks evaluate agent capabilities in embodied reasoning, multi-step task planning, and interactive problem-solving.","intents":["Evaluate LLM agents' ability to reason about and complete household tasks","Test agent capabilities in spatial reasoning and object manipulation","Measure agent performance in multi-step task planning and execution","Analyze agent reasoning about household physics and object interactions"],"best_for":["Researchers evaluating LLM agents' embodied reasoning and task planning capabilities","Teams testing agent household task completion and spatial reasoning","Developers analyzing agent multi-step planning and error recovery"],"limitations":["Household simulation may not capture all real-world complexity and edge cases","Agent performance depends on task specification and environment design","Spatial reasoning may exceed some LLM agents' capabilities","No support for dynamic household changes or unexpected obstacles"],"requires":["Python 3.8+","Household simulation environment (ALFWorld or similar)","Object definitions and interaction rules","Room layouts and spatial information"],"input_types":["Task description (string: household chore to complete)","Environment state (text description or visual)","Available objects and their properties (list)","Agent location and inventory (structured)"],"output_types":["Agent action (move, pick up, put down, use object)","Navigation command (go to room/location)","Object manipulation (interact with object)","Task completion status (boolean)","Performance metrics (task success, efficiency, action count)"],"categories":["planning-reasoning","automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"github-thudm--agentbench__cap_13","uri":"capability://planning.reasoning.lateral.thinking.puzzle.task.environment.with.constraint.based.reasoning","name":"lateral thinking puzzle task environment with constraint-based reasoning","description":"Provides a lateral thinking puzzle task environment where agents solve puzzles requiring creative, non-linear reasoning and constraint satisfaction. Agents interact with a puzzle system that presents scenarios, accepts guesses/hypotheses, and provides feedback on correctness. The environment manages puzzle state, constraint tracking, and solution validation. Tasks evaluate agent capabilities in creative problem-solving, hypothesis generation, constraint reasoning, and iterative refinement. Agents must think beyond obvious solutions and reason about implicit constraints.","intents":["Evaluate LLM agents' ability to solve lateral thinking puzzles requiring creative reasoning","Test agent capabilities in constraint satisfaction and hypothesis generation","Measure agent performance in iterative problem-solving and refinement","Analyze agent reasoning patterns in non-linear problem domains"],"best_for":["Researchers evaluating LLM agents' creative reasoning and lateral thinking capabilities","Teams testing agent constraint reasoning and hypothesis generation","Developers analyzing agent iterative problem-solving strategies"],"limitations":["Lateral thinking puzzles are subjective, making evaluation criteria ambiguous","Agent performance depends on puzzle design and constraint clarity","Some puzzles may have multiple valid solutions, complicating evaluation","Agent reasoning may diverge from puzzle designer's intended solution path"],"requires":["Python 3.8+","Puzzle definitions with scenarios and constraints","Solution validation logic","Feedback generation for agent guesses"],"input_types":["Puzzle scenario (string)","Constraints (list of strings)","Agent guess/hypothesis (string)","Feedback history (list of previous guesses and feedback)"],"output_types":["Agent hypothesis/guess (string)","Reasoning explanation (string)","Feedback (correct/incorrect/partial)","Solution (string)","Performance metrics (puzzle success, guess efficiency, reasoning quality)"],"categories":["planning-reasoning","testing-quality"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"github-thudm--agentbench__cap_14","uri":"capability://planning.reasoning.digital.card.game.task.environment.with.strategic.decision.making","name":"digital card game task environment with strategic decision-making","description":"Provides a digital card game task environment where agents play strategic card games requiring decision-making, resource management, and opponent modeling. Agents receive game state information (hand, board, opponent state), select actions (play cards, attack, defend), and observe game outcomes. The environment manages game rules, turn order, win conditions, and card interactions. Tasks evaluate agent capabilities in strategic reasoning, resource optimization, and decision-making under uncertainty. Agents must balance multiple objectives and adapt strategies based on game state.","intents":["Evaluate LLM agents' ability to play strategic card games","Test agent capabilities in resource management and decision-making","Measure agent performance in strategic reasoning and opponent modeling","Analyze agent decision-making patterns and strategy adaptation"],"best_for":["Researchers evaluating LLM agents' strategic reasoning and game-playing capabilities","Teams testing agent decision-making in resource-constrained environments","Developers analyzing agent strategy adaptation and opponent modeling"],"limitations":["Card game complexity may exceed some LLM agents' reasoning capabilities","Game outcomes depend on card draws and randomness, affecting reproducibility","Agent performance depends on game design and card balance","No support for simultaneous multi-agent card games"],"requires":["Python 3.8+","Card game engine with rules and card definitions","Game state management and turn coordination","Card interaction logic"],"input_types":["Game state (dict with hand, board, resources, opponent state)","Available actions (list of playable cards and actions)","Game history (list of previous turns)","Card definitions (properties, effects, costs)"],"output_types":["Agent action (play card, attack, defend, pass)","Card selection (card ID)","Target selection (opponent or board position)","Game outcome (win/loss)","Performance metrics (win rate, resource efficiency, decision quality)"],"categories":["planning-reasoning","automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"github-thudm--agentbench__cap_15","uri":"capability://automation.workflow.configuration.driven.task.and.agent.setup.with.yaml.json.specifications","name":"configuration-driven task and agent setup with yaml/json specifications","description":"Provides a configuration system that enables users to define task environments, agent parameters, and evaluation assignments through YAML or JSON configuration files. The configuration system abstracts away code-level customization, enabling non-developers to set up benchmarks by editing configuration files. Supports task-specific parameters (environment type, sample count, resource limits), agent-specific parameters (model, temperature, prompt template), and assignment-level parameters (worker count, timeout). Configuration validation ensures correctness before execution.","intents":["Set up benchmark evaluations without writing code by editing configuration files","Reproduce benchmark runs by sharing configuration files","Experiment with different agent parameters (temperature, prompt) without code changes","Configure task-specific parameters (sample count, resource limits) for different evaluation scales"],"best_for":["Non-technical users setting up benchmarks through configuration files","Teams sharing reproducible benchmark configurations across researchers","Developers experimenting with different agent parameters and task settings"],"limitations":["Configuration files can become complex for advanced customization","No built-in validation for semantic correctness (e.g., invalid model names)","Configuration schema is not versioned, making upgrades potentially breaking","Limited support for dynamic configuration based on runtime conditions"],"requires":["Python 3.8+","YAML or JSON configuration files","Understanding of configuration schema and available options"],"input_types":["Configuration file (YAML/JSON)","Task name (string)","Agent model name (string)","Hyperparameters (dict)"],"output_types":["Parsed configuration (dict)","Validation errors (list of strings)","Instantiated tasks and agents (objects)"],"categories":["automation-workflow","tool-use-integration"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"github-thudm--agentbench__cap_2","uri":"capability://planning.reasoning.agent.interface.with.standardized.decision.making.and.session.communication","name":"agent interface with standardized decision-making and session communication","description":"Defines a standardized Agent interface that abstracts how LLMs and other decision-makers interact with task environments through a Session communication channel. Agents receive observations from tasks, generate actions, and receive feedback in a multi-turn loop. The interface supports both sophisticated LLM-based agents (with prompt engineering, chain-of-thought reasoning) and naive rule-based agents, enabling comparison of different agent architectures. Session management tracks conversation history and message exchange, providing agents with context for decision-making.","intents":["Implement LLM-based agents that interact with task environments through standardized APIs","Compare different agent architectures (LLM-based vs rule-based) on the same tasks","Track agent decision-making process and conversation history for analysis","Support both proprietary LLMs (GPT-4, Claude) and open-source models (Llama, Mistral)"],"best_for":["Researchers implementing custom agent strategies for benchmark evaluation","Teams comparing LLM-based agents against baseline/naive agents","Developers analyzing agent decision traces and conversation patterns"],"limitations":["Agent interface does not enforce action validation, requiring tasks to handle invalid actions","No built-in support for agent memory persistence across multiple task instances","Session history grows unbounded, potentially causing context window overflow for long interactions","No standardized way to implement agent exploration vs exploitation trade-offs"],"requires":["Python 3.8+","Implementation of Agent interface with act() method","Session object for managing agent-task communication","LLM API access (OpenAI, Anthropic) or local model deployment"],"input_types":["Observation from task (string/structured)","Session history (message list)","Agent configuration (dict)"],"output_types":["Agent action (string/structured)","Reasoning trace (optional string)","Confidence score (optional float)"],"categories":["planning-reasoning","tool-use-integration"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"github-thudm--agentbench__cap_3","uri":"capability://memory.knowledge.session.based.multi.turn.conversation.management.between.agents.and.tasks","name":"session-based multi-turn conversation management between agents and tasks","description":"Implements a Session abstraction that manages the communication channel between agents and task environments, handling message exchange, conversation history tracking, and state synchronization across multiple turns. Sessions maintain a chronological record of agent observations, actions, and task feedback, enabling agents to make decisions based on accumulated context. The Session interface standardizes how agents receive observations and submit actions, decoupling agent logic from environment-specific communication protocols.","intents":["Maintain conversation history for agent decision-making across multiple interaction turns","Synchronize state between agent and task environment to prevent desynchronization","Enable analysis of agent reasoning patterns by examining full conversation traces","Support context-aware agent behavior that leverages previous interactions"],"best_for":["Analyzing agent behavior through conversation traces and decision logs","Implementing agents that require multi-turn context (e.g., dialogue-based reasoning)","Debugging agent-environment interactions by examining message sequences"],"limitations":["Session history is stored in memory, limiting evaluation of very long interactions (100+ turns)","No built-in compression or summarization of conversation history, causing context window issues","Session state is not persisted by default, requiring manual serialization for checkpointing","No conflict resolution mechanism if agent and task state diverge"],"requires":["Python 3.8+","Task and Agent implementations that conform to Session interface","Sufficient memory for storing conversation history (typically <10MB per session)"],"input_types":["Observation message (string/structured)","Action message (string/structured)","Metadata (dict with timestamps, turn numbers)"],"output_types":["Conversation history (list of messages)","Current state (dict)","Turn count (integer)","Serialized session (JSON/pickle)"],"categories":["memory-knowledge","automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"github-thudm--agentbench__cap_4","uri":"capability://automation.workflow.task.controller.orchestration.with.distributed.task.execution.and.resource.management","name":"task controller orchestration with distributed task execution and resource management","description":"Implements a Task Controller that orchestrates the execution of benchmark tasks across multiple workers, managing resource allocation, task assignment, and result aggregation. The controller uses a Task Assigner to distribute samples across workers and a pool of Task Workers to execute agent-task interactions in parallel. This architecture enables efficient evaluation of agents across large sample sets while managing system resources (memory, CPU, disk) and handling task startup/teardown. The controller coordinates the lifecycle of task environments (initialization, sample execution, metric calculation, cleanup).","intents":["Evaluate agents across large sample sets efficiently using parallel task execution","Manage resource constraints (memory, disk, startup time) across multiple task environments","Aggregate results from distributed task workers into unified performance metrics","Handle task environment lifecycle (startup, execution, shutdown) automatically"],"best_for":["Large-scale agent evaluation requiring parallel execution across multiple samples","Resource-constrained environments where task startup overhead must be amortized","Teams running comprehensive benchmarks across all 8 environments simultaneously"],"limitations":["Parallel execution introduces non-determinism due to timing variations, affecting reproducibility","Resource management is static (fixed worker count), not adaptive to varying task demands","No built-in support for fault tolerance or task retry on failure","Distributed execution adds complexity, making debugging agent-task interactions harder"],"requires":["Python 3.8+","Sufficient system resources (CPU cores, memory, disk) for parallel task execution","Task and Agent implementations that are thread-safe or process-isolated","Configuration of worker count and resource limits"],"input_types":["Task configuration (dict)","Agent configuration (dict)","Sample indices (list of integers)","Worker count (integer)"],"output_types":["Aggregated metrics (dict)","Per-sample results (list of dicts)","Execution logs (structured)","Resource usage statistics (dict)"],"categories":["automation-workflow","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"github-thudm--agentbench__cap_5","uri":"capability://data.processing.analysis.environment.specific.metric.calculation.and.performance.aggregation","name":"environment-specific metric calculation and performance aggregation","description":"Provides a standardized Evaluation Metrics subsystem where each task environment implements domain-specific metric calculation (e.g., success rate for games, SQL correctness for databases, task completion for household tasks). The framework aggregates per-sample metrics into overall performance scores while preserving environment-specific semantics. Metrics are calculated after task execution completes, enabling post-hoc analysis and comparison across agents. The metric interface supports both binary success indicators and continuous performance scores.","intents":["Calculate performance metrics that are meaningful for each task domain (not generic accuracy)","Compare agent performance across environments using domain-appropriate metrics","Aggregate individual sample results into overall benchmark scores","Enable detailed performance analysis by examining per-sample metric breakdowns"],"best_for":["Researchers analyzing agent performance across heterogeneous task domains","Teams creating leaderboards or rankings of agent performance","Developers implementing custom metrics for new task environments"],"limitations":["Metrics are not directly comparable across environments (e.g., game success rate vs SQL correctness)","No built-in statistical significance testing or confidence intervals","Metric calculation is environment-specific, requiring custom implementation per task","No support for weighted aggregation across environments with different importance"],"requires":["Python 3.8+","Task implementation with get_metrics() method","Per-sample execution results (agent actions, task states)","Understanding of environment-specific success criteria"],"input_types":["Per-sample execution trace (dict)","Task environment type (string)","Agent actions and observations (list)"],"output_types":["Per-sample metrics (dict with float/bool values)","Aggregated metrics (dict with mean, std, min, max)","Metric breakdowns (dict of lists for detailed analysis)","Leaderboard scores (float)"],"categories":["data-processing-analysis","testing-quality"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"github-thudm--agentbench__cap_6","uri":"capability://planning.reasoning.avalon.game.environment.with.strategic.multi.agent.gameplay.simulation","name":"avalon game environment with strategic multi-agent gameplay simulation","description":"Implements a complete Avalon card game environment where LLM agents play a social deduction game requiring strategic reasoning, communication, and deception detection. The environment includes a game engine that manages game state, turn order, voting mechanics, and win conditions, while agents interact through natural language communication and action selection. The Avalon task evaluates agent capabilities in multi-agent strategic reasoning, persuasion, and information inference from incomplete information. Agents must balance exploration (gathering information) with exploitation (making winning moves).","intents":["Evaluate LLM agents' ability to play strategic games requiring multi-turn reasoning","Test agent capabilities in social deduction and persuasion within a game context","Measure agent performance in multi-agent environments with competing objectives","Analyze agent communication patterns and strategic decision-making in games"],"best_for":["Researchers studying LLM agents in multi-agent strategic environments","Teams evaluating agent reasoning in games with incomplete information","Developers analyzing agent communication and persuasion strategies"],"limitations":["Game outcomes depend on all agents' strategies, making single-agent evaluation difficult","Avalon game complexity may exceed some LLM agents' reasoning capabilities","Game state space is large, making exhaustive evaluation computationally expensive","Multi-agent interactions introduce non-determinism, affecting reproducibility"],"requires":["Python 3.8+","Avalon game engine implementation","Multiple LLM agents or mix of LLM and baseline agents","Game state management and turn coordination"],"input_types":["Game state (dict with player roles, votes, history)","Agent role (string: 'good', 'evil', 'merlin')","Game history (list of previous rounds)","Available actions (list of strings)"],"output_types":["Agent action (string: vote, propose, claim)","Agent communication (natural language)","Game outcome (win/loss)","Performance metrics (win rate, persuasion effectiveness)"],"categories":["planning-reasoning","automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"github-thudm--agentbench__cap_7","uri":"capability://tool.use.integration.operating.system.command.line.task.environment.with.linux.shell.interaction","name":"operating system command-line task environment with linux shell interaction","description":"Provides a Linux OS command-line task environment where agents interact with a shell interface to complete system administration and file manipulation tasks. Agents receive shell prompts, issue commands, and observe command output in a multi-turn interaction loop. The environment manages a sandboxed Linux filesystem and command execution, enabling safe evaluation of agent capabilities in command-line reasoning and system administration. Tasks include file operations, text processing, system queries, and scripting.","intents":["Evaluate LLM agents' ability to reason about and execute shell commands","Test agent capabilities in file system navigation and manipulation","Measure agent performance in system administration and troubleshooting tasks","Analyze agent command-line reasoning patterns and error recovery"],"best_for":["Researchers evaluating LLM agents' system administration capabilities","Teams testing agent reasoning in command-line environments","Developers analyzing agent error recovery and command composition"],"limitations":["Sandboxed environment may not support all Linux commands or system features","Command execution is sequential, limiting agent parallelization strategies","Error messages from failed commands may be cryptic, challenging agent interpretation","No support for interactive commands (vim, less, etc.) that require terminal control"],"requires":["Python 3.8+","Linux environment or Docker container for sandboxing","Shell command execution capability (subprocess module)","Filesystem isolation for safety"],"input_types":["Shell prompt (string)","Previous command output (string)","Task description (string)","Available filesystem state (implicit)"],"output_types":["Shell command (string)","Command output (string)","Task completion status (boolean)","Performance metrics (success rate, command efficiency)"],"categories":["tool-use-integration","automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"github-thudm--agentbench__cap_8","uri":"capability://tool.use.integration.database.sql.query.task.environment.with.schema.aware.interaction","name":"database sql query task environment with schema-aware interaction","description":"Provides a database task environment where agents interact with SQL databases to complete data querying and manipulation tasks. Agents receive database schemas, issue SQL queries, and observe query results in a multi-turn loop. The environment manages a sandboxed database instance with predefined schemas and data, enabling evaluation of agent capabilities in SQL reasoning, schema understanding, and query composition. Tasks include data retrieval, aggregation, filtering, and complex joins.","intents":["Evaluate LLM agents' ability to reason about database schemas and compose SQL queries","Test agent capabilities in data retrieval and manipulation tasks","Measure agent performance in understanding relational database concepts","Analyze agent query composition patterns and error recovery from SQL errors"],"best_for":["Researchers evaluating LLM agents' database reasoning capabilities","Teams testing agent SQL composition and schema understanding","Developers analyzing agent error recovery from SQL syntax/logic errors"],"limitations":["SQL dialect variations (MySQL, PostgreSQL, SQLite) may affect agent performance","Complex queries with multiple joins or subqueries may exceed agent reasoning capabilities","Query optimization is not evaluated, only correctness","No support for stored procedures or advanced SQL features"],"requires":["Python 3.8+","Database system (SQLite, PostgreSQL, MySQL)","Predefined database schemas and sample data","SQL query execution and result parsing"],"input_types":["Database schema (string or structured)","Task description (string)","Previous query results (string/table)","Available tables and columns (implicit)"],"output_types":["SQL query (string)","Query results (table/structured)","Query success/error status (boolean/string)","Performance metrics (query correctness, efficiency)"],"categories":["tool-use-integration","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"github-thudm--agentbench__cap_9","uri":"capability://memory.knowledge.knowledge.graph.querying.and.reasoning.task.environment","name":"knowledge graph querying and reasoning task environment","description":"Provides a knowledge graph task environment where agents query and reason over structured knowledge representations to answer questions and complete reasoning tasks. Agents interact with a knowledge graph API, issuing queries to retrieve entities, relationships, and perform multi-hop reasoning. The environment manages a sandboxed knowledge graph with predefined entities and relationships, enabling evaluation of agent capabilities in semantic reasoning, relationship inference, and multi-step knowledge navigation. Tasks include entity lookup, relationship discovery, and transitive reasoning.","intents":["Evaluate LLM agents' ability to reason over structured knowledge representations","Test agent capabilities in multi-hop reasoning and relationship inference","Measure agent performance in semantic understanding and entity linking","Analyze agent knowledge graph navigation patterns and reasoning strategies"],"best_for":["Researchers evaluating LLM agents' semantic reasoning capabilities","Teams testing agent knowledge graph navigation and multi-hop reasoning","Developers analyzing agent entity linking and relationship discovery"],"limitations":["Knowledge graph completeness affects task difficulty and agent performance","Multi-hop reasoning may require agents to maintain state across multiple queries","Entity disambiguation is not handled, assuming unique entity names","No support for temporal reasoning or dynamic knowledge updates"],"requires":["Python 3.8+","Knowledge graph system (RDF, property graph, or custom)","Query API for entity and relationship lookup","Predefined knowledge graph with entities and relationships"],"input_types":["Query (string or structured)","Entity/relationship names (string)","Previous query results (structured)","Knowledge graph schema (implicit)"],"output_types":["Query results (entities, relationships, paths)","Reasoning trace (multi-hop path)","Answer to question (string/structured)","Performance metrics (reasoning correctness, query efficiency)"],"categories":["memory-knowledge","planning-reasoning"],"confidence":0.5,"matches":0,"success_rate":0}],"trust":{"score":35,"verified":false,"data_access_risk":"high","permissions":["Python 3.8+","API keys for proprietary LLMs (OpenAI, Anthropic) or local LLM deployment","15GB+ disk space for Web Shopping environment","Linux environment for OS command-line task execution","Understanding of AgentBench Task interface contract","Implementation of required methods: get_indices(), execute(), get_metrics()","Simulated e-commerce platform (WebShop or similar)","Product catalog and pricing data","1GB+ disk space for web browsing environment","Simulated web environment (Mind2Web or similar)"],"failure_modes":["Web Shopping and Web Browsing environments require 15GB and 1GB respectively, limiting local evaluation","Startup times vary significantly (5s-5min), making batch evaluation of many samples time-intensive","Environment-specific metrics are not directly comparable across domains, requiring separate analysis per task type","No built-in support for custom evaluation metrics beyond environment-provided ones","Interface abstraction may hide important environment-specific details, requiring documentation per task","Metric calculation must be implemented per-environment, preventing cross-environment metric comparison","No built-in versioning for task definitions, making task evolution tracking difficult","Sample indices are integer-based, limiting support for hierarchical or named sample organization","Web shopping environment requires ~15GB disk space, limiting local evaluation","Simulated e-commerce may not capture all real-world complexity and edge cases","builder identity is not verified yet","no observed match outcomes yet"],"rank_breakdown":{"adoption":0.2940792038561906,"quality":0.35,"ecosystem":0.52,"match_graph":0.25,"freshness":0.6,"weights":{"adoption":0.25,"quality":0.35,"ecosystem":0.15,"match_graph":0.2,"freshness":0.05}},"observed_outcomes":{"matches":0,"success_rate":0,"avg_confidence":0,"top_intents":[],"last_matched_at":null},"maintenance":{"status":"active","updated_at":"2026-05-24T12:16:22.064Z","last_scraped_at":"2026-05-03T13:57:11.504Z","last_commit":"2026-02-08T17:01:05Z"},"community":{"stars":3386,"forks":251,"weekly_downloads":null,"model_downloads":null,"model_likes":null}},"distribution":{"claim_url":"https://unfragile.ai/submit?claim=thudm--agentbench","compare_url":"https://unfragile.ai/compare?artifact=thudm--agentbench"}},"signature":"94Ug+/ihscvZEtR6hW6HP7RKIJvRDEsGUZrvIWb6VaU1oudkuKzWs3z08cwd6pPCgeAjwbCfWwI+KH2cIfCuBA==","signedAt":"2026-06-22T18:33:24.916Z","signedBy":"unfragile.ai","version":1},"_links":{"self":"https://unfragile.ai/api/v1/passport/thudm--agentbench","artifact":"https://unfragile.ai/thudm--agentbench","verify":"https://unfragile.ai/api/v1/verify?slug=thudm--agentbench","publicKey":"https://unfragile.ai/api/v1/trust-passport-public-key","spec":"https://unfragile.ai/trust","schema":"https://unfragile.ai/schema.json","docs":"https://unfragile.ai/docs"}}