{"passport":{"unfragile":{"@version":"1.0","version":"2026-05","artifact":{"id":"arc-agi","slug":"arc-agi","name":"ARC-AGI","type":"benchmark","url":"https://arcprize.org","page_url":"https://unfragile.ai/arc-agi","categories":["testing-quality"],"tags":[],"pricing":{"model":"free","free":true,"starting_price":null},"status":"active","verified":false},"capabilities":[{"id":"arc-agi__cap_0","uri":"capability://planning.reasoning.interactive.visual.puzzle.task.generation","name":"interactive-visual-puzzle-task-generation","description":"Generates and renders abstract visual puzzle tasks as interactive game environments where agents must explore state spaces, plan actions, and achieve goals through a Percept → Plan → Action cycle. Tasks are presented in configurable rendering modes (terminal text-based or programmatic API access) and support memory persistence across action sequences, enabling agents to learn patterns from minimal examples.","intents":["Evaluate whether an AI system can recognize novel abstract patterns from 1-5 training examples","Test an agent's ability to explore and reason about unfamiliar visual puzzle environments","Measure learning efficiency by observing how quickly a system generalizes from limited task demonstrations","Benchmark fluid intelligence and abstract reasoning capabilities independent of language understanding"],"best_for":["AI researchers measuring general reasoning capabilities","Teams developing reasoning-focused LLM agents","Benchmark participants competing in the ARC Prize 2026"],"limitations":["Visual-only format excludes language-based reasoning; no text input/output in puzzle solving","Task specifics (grid dimensions, color palettes, transformation rules) not fully documented in public materials","No dynamic task generation or rotation mentioned; contamination risk high in active competition environment","Evaluation protocol and statistical rigor not formally specified; no confidence intervals or significance testing documented","Single-agent focus; does not measure multi-agent coordination or collaborative reasoning"],"requires":["Python 3.8+","arc-agi package (installable via pip install arc-agi or uv add arc-agi)","Optional: ARC_API_KEY environment variable for public game access (anonymous key available with limitations)"],"input_types":["game_state (visual grid representation)","GameAction enum (discrete action space)","render_mode parameter (terminal or programmatic)"],"output_types":["rendered_observation (text or structured state)","reward signal (implicit success/failure)","scorecard (structured evaluation results)"],"categories":["planning-reasoning","benchmark"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"arc-agi__cap_1","uri":"capability://automation.workflow.local.python.sdk.task.execution","name":"local-python-sdk-task-execution","description":"Provides a Python SDK (arc-agi package) for local execution of benchmark tasks with configurable rendering modes and performance optimization. The SDK exposes a GameAction class for discrete action specification, an Arcade environment factory for task instantiation, and a scorecard evaluation system. Execution runs entirely client-side without mandatory cloud dependencies, achieving 2000+ FPS when rendering is disabled.","intents":["Run benchmark tasks locally without network latency or API rate limits","Integrate ARC-AGI evaluation into custom agent training pipelines","Optimize evaluation throughput by disabling rendering for batch evaluation","Access task state programmatically for custom analysis and visualization"],"best_for":["Researchers with local compute resources and reproducibility requirements","Teams building custom agents requiring tight integration with evaluation loop","Developers optimizing for evaluation throughput in iterative development"],"limitations":["Local execution requires sufficient disk space and memory for all task environments","Rendering mode (terminal) adds latency; exact overhead not quantified","No built-in sandboxing or resource limits documented; malicious agents could consume unbounded compute","API key optional but recommended; anonymous access has undocumented limitations","Scorecard output structure not formally specified; reverse-engineering required for custom metrics"],"requires":["Python 3.8+","pip or uv package manager","Optional: .env file with ARC_API_KEY for full feature access"],"input_types":["GameAction enum (discrete action space)","render_mode string (terminal or None)","task_id string (e.g., ls20, ft09)"],"output_types":["observation (rendered state or structured representation)","done flag (boolean task completion)","scorecard (evaluation metrics structure)"],"categories":["automation-workflow","tool-use-integration"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"arc-agi__cap_10","uri":"capability://automation.workflow.environment.step.based.interaction.loop","name":"environment-step-based-interaction-loop","description":"Implements the core agent-environment interaction loop through env.step(action), which executes an action, updates task state, and returns observations. The step function encapsulates the Percept → Plan → Action cycle, enabling agents to iteratively explore tasks and learn patterns. Step returns observation, done flag, and implicit feedback enabling agents to assess action effectiveness.","intents":["Enable agents to iteratively interact with tasks through action-observation cycles","Support multi-step planning and learning across action sequences","Provide feedback on action effectiveness through state changes","Enable episode termination detection for task completion"],"best_for":["Agents using iterative planning or reinforcement learning","Teams developing multi-step reasoning approaches","Researchers studying agent exploration and learning dynamics"],"limitations":["Step function signature not formally specified; return values undefined","Episode termination criteria not documented; unclear when done flag is set","Implicit reward mechanism not specified; agents must infer success from observations","Maximum episode length not documented; potential for infinite loops","No step limit or timeout mechanism documented"],"requires":["Environment instance (via Arcade.make())","GameAction enum value for action selection"],"input_types":["action (GameAction enum value)"],"output_types":["observation (updated task state)","done (boolean episode termination flag)","implicit_feedback (inferred from state change)"],"categories":["automation-workflow","planning-reasoning"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"arc-agi__cap_11","uri":"capability://automation.workflow.open.source.benchmark.ecosystem","name":"open-source-benchmark-ecosystem","description":"Provides open-source access to benchmark tasks, evaluation infrastructure, and reference implementations, enabling community-driven research and algorithm development. The benchmark is published on GitHub with MIT license (implied by open-source claim), supporting reproducibility, contribution, and derivative work. Foundation explicitly emphasizes 'open-source ecosystem' and rewards open-source contributions through ARC Prize 2026.","intents":["Access benchmark tasks and evaluation code for reproducible research","Contribute novel algorithms and improvements to the benchmark ecosystem","Build derivative tools and analysis frameworks on top of the benchmark","Participate in community-driven research and algorithm development"],"best_for":["Academic researchers requiring reproducible benchmarks","Open-source contributors seeking community-driven projects","Teams building custom evaluation frameworks and analysis tools"],"limitations":["License terms not explicitly specified; assumed MIT but not confirmed","Contribution guidelines not documented; unclear how to submit improvements","Repository structure and code organization not described","No mention of version control strategy or release cycle","Community governance model not specified; unclear how decisions are made"],"requires":["Git for cloning repository","Python 3.8+ for running code","GitHub account for contributions (optional)"],"input_types":["source code (Python SDK, evaluation scripts)","benchmark tasks (visual puzzle definitions)"],"output_types":["cloned repository (local copy of benchmark)","executable environment (installed SDK)"],"categories":["automation-workflow","memory-knowledge"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"arc-agi__cap_2","uri":"capability://tool.use.integration.rest.api.based.remote.task.access","name":"rest-api-based-remote-task-access","description":"Exposes benchmark tasks and evaluation through a REST API (documented at https://docs.arcprize.org) with API key authentication, enabling remote task access without local installation. The API abstracts task execution and scoring, allowing integration into web-based systems, cloud pipelines, and multi-language environments. Authentication uses API keys (with anonymous access available but limited).","intents":["Integrate ARC-AGI evaluation into cloud-based agent systems without local compute","Build web dashboards or monitoring systems that query task state and scores remotely","Enable multi-language agent implementations (non-Python) to access benchmark tasks","Participate in ARC Prize 2026 competition with remote submission and evaluation"],"best_for":["Teams with cloud-native architectures (Kubernetes, serverless)","Multi-language projects requiring language-agnostic benchmark access","Researchers without local compute resources or storage capacity","Competition participants requiring standardized evaluation infrastructure"],"limitations":["API rate limits and quota not documented; potential bottleneck for high-throughput evaluation","Network latency adds overhead vs. local execution; exact latency not quantified","API key management required; no mention of key rotation, expiration, or revocation mechanisms","REST API specification not provided in source material; reverse-engineering required","Anonymous access limitations not specified; may restrict task access or evaluation frequency"],"requires":["HTTP client library (curl, requests, etc.)","ARC_API_KEY environment variable or explicit header authentication","Network connectivity to arcprize.org API endpoint"],"input_types":["HTTP POST/GET requests","task_id parameter","GameAction serialization (format not specified)"],"output_types":["JSON response (structure not documented)","scorecard data (format not specified)","task state representation (format not specified)"],"categories":["tool-use-integration","automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"arc-agi__cap_3","uri":"capability://planning.reasoning.abstract.pattern.recognition.evaluation","name":"abstract-pattern-recognition-evaluation","description":"Measures an AI system's ability to recognize and generalize abstract patterns from minimal examples (1-5 training demonstrations) without domain-specific knowledge or pre-training on similar tasks. Evaluation is based on whether agents can infer transformation rules, spatial relationships, and logical operations from limited visual evidence and apply them to novel test cases. This capability directly measures fluid intelligence and learning efficiency rather than memorized knowledge.","intents":["Assess whether an AI system exhibits genuine learning and generalization vs. pattern matching on familiar domains","Measure how efficiently a system learns new abstract concepts from minimal examples","Evaluate reasoning about spatial transformations, color operations, and logical rules","Identify gaps in current AI capabilities relative to human-level abstract reasoning"],"best_for":["AGI researchers measuring progress toward general intelligence","Teams developing reasoning-focused models (not retrieval-based)","Benchmark designers seeking tasks that resist scaling-only solutions"],"limitations":["No quantitative performance baselines provided; cannot assess actual difficulty or SOTA progress","Scoring methodology not formally specified; scorecard structure undefined","Task composition (grid sizes, color palettes, transformation types) not fully documented","No evidence of correlation with real-world reasoning tasks or downstream performance","High contamination risk in active competition; no task rotation or dynamic generation mentioned","Ceiling effects not quantified; gap between SOTA and human performance unknown"],"requires":["Agent capable of visual perception and state representation","Planning/reasoning capability to infer patterns from examples","Memory mechanism to retain learned patterns across test cases"],"input_types":["visual_grid (abstract puzzle representation)","training_examples (1-5 demonstrations of input-output pairs)","test_input (novel puzzle requiring pattern application)"],"output_types":["test_output (predicted solution grid)","correctness_flag (boolean success/failure)","scorecard (aggregated performance metrics)"],"categories":["planning-reasoning","benchmark"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"arc-agi__cap_4","uri":"capability://planning.reasoning.agent.memory.and.goal.acquisition","name":"agent-memory-and-goal-acquisition","description":"Supports agent memory persistence and goal acquisition across action sequences, enabling agents to maintain state, learn from observations, and dynamically discover task objectives. The Percept → Plan → Action cycle allows agents to accumulate knowledge across multiple steps, with memory mechanisms enabling pattern recognition and strategy refinement. Goals are not explicitly provided; agents must infer them from task structure and feedback.","intents":["Enable agents to learn and refine strategies across multiple action steps","Support agents in discovering task objectives through exploration and observation","Measure how effectively agents utilize memory to improve performance on repeated or similar tasks","Evaluate agents' ability to maintain context and adapt behavior based on accumulated experience"],"best_for":["Agents with internal state management and learning mechanisms","Teams developing reinforcement learning or planning-based approaches","Researchers studying how agents discover and pursue implicit goals"],"limitations":["Memory mechanism details not documented; capacity limits and persistence model unknown","Goal acquisition process not formally specified; no guidance on how agents should infer objectives","No explicit reward signal or feedback mechanism documented; agents must infer success from state changes","Memory reset policy between tasks not specified; unclear if memory persists across task boundaries","No quantified metrics for memory efficiency or goal discovery speed"],"requires":["Agent with internal state representation and update mechanism","Observation processing capability to extract patterns from percepts","Planning/reasoning module to map observations to actions"],"input_types":["percept (current task state observation)","history (previous observations and actions)","implicit_goal (inferred from task structure)"],"output_types":["action (discrete GameAction)","updated_memory (agent's internal state)","goal_hypothesis (agent's inferred objective)"],"categories":["planning-reasoning","memory-knowledge"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"arc-agi__cap_5","uri":"capability://automation.workflow.configurable.rendering.and.visualization","name":"configurable-rendering-and-visualization","description":"Provides dual rendering modes for task visualization: terminal-based text rendering for human inspection and programmatic access (no rendering) for high-performance evaluation. Terminal mode enables visual debugging and human understanding of task state, while the no-render mode optimizes for throughput (2000+ FPS) by eliminating rendering overhead. Rendering mode is configurable per task instantiation.","intents":["Debug agent behavior by visualizing task state in human-readable terminal format","Optimize evaluation throughput for batch evaluation by disabling rendering","Inspect puzzle structure and transformation rules during development","Balance interpretability and performance based on evaluation phase (development vs. production)"],"best_for":["Developers debugging agent behavior during development","Teams running large-scale batch evaluations requiring maximum throughput","Researchers analyzing task structure and agent decision-making"],"limitations":["Terminal rendering adds latency; exact overhead not quantified","Terminal mode limited to text-based visualization; no graphical output","Rendering mode must be specified at task instantiation; cannot toggle mid-episode","No custom rendering plugins or visualization frameworks documented","Performance (2K+ FPS) only achieved with render_mode=None; terminal mode performance unknown"],"requires":["Terminal emulator supporting ANSI color codes (for terminal render mode)","Python SDK with render_mode parameter support"],"input_types":["render_mode parameter (string: 'terminal' or None)","task_state (internal representation)"],"output_types":["terminal_output (ANSI-formatted text visualization)","structured_state (programmatic representation when render_mode=None)"],"categories":["automation-workflow","image-visual"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"arc-agi__cap_6","uri":"capability://data.processing.analysis.scorecard.based.evaluation.aggregation","name":"scorecard-based-evaluation-aggregation","description":"Aggregates task performance into a structured scorecard that summarizes agent evaluation results across the benchmark. The scorecard is generated via arc.get_scorecard() and provides aggregated metrics, though the exact structure and metrics are not formally documented. Scorecard enables comparison across agents and tracking of performance progress.","intents":["Obtain aggregated performance metrics across all benchmark tasks","Compare agent performance against baselines and other systems","Track performance progress across training iterations or model versions","Generate standardized evaluation reports for publication or competition submission"],"best_for":["Researchers comparing agent performance quantitatively","Teams tracking progress across development iterations","Competition participants generating official evaluation reports"],"limitations":["Scorecard structure not formally specified; reverse-engineering required for custom analysis","Metrics included in scorecard not documented (accuracy, success rate, etc.)","No guidance on statistical significance, confidence intervals, or error bars","Aggregation method (mean, median, weighted) not specified","No breakdown by task difficulty, category, or other dimensions documented"],"requires":["Completed task evaluations (via env.step() calls)","Python SDK with get_scorecard() method"],"input_types":["evaluation_history (completed task results)"],"output_types":["scorecard (structured evaluation summary)","metrics (aggregated performance numbers)"],"categories":["data-processing-analysis","benchmark"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"arc-agi__cap_7","uri":"capability://automation.workflow.arc.prize.2026.competition.integration","name":"arc-prize-2026-competition-integration","description":"Integrates with the ARC Prize 2026 competition infrastructure, enabling researchers to submit solutions, receive evaluation on held-out test sets, and compete for $2M in prizes. Competition is hosted on Kaggle and provides standardized submission mechanisms, leaderboard tracking, and prize distribution. The foundation rewards open-source contributions and novel algorithmic progress.","intents":["Submit agent solutions to official ARC Prize 2026 competition","Receive evaluation on held-out test sets not available in public benchmark","Compete for prize money and recognition for novel reasoning approaches","Contribute open-source solutions and receive rewards for progress"],"best_for":["Researchers developing novel reasoning algorithms","Teams with resources to compete in high-stakes benchmarks","Open-source contributors seeking funding for AI research"],"limitations":["Submission mechanism not formally documented; reverse-engineering required","Prize distribution criteria not specified; unclear how $2M is allocated","Evaluation timeline not documented; turnaround time for submissions unknown","Held-out test set composition not disclosed; potential for distribution shift","Competition rules and eligibility criteria not provided in source material","Leaderboard contents not accessible; cannot assess current SOTA or competition status"],"requires":["Kaggle account registration","Submission in format specified by competition (format not documented)","Compliance with competition rules and open-source requirements"],"input_types":["agent_solution (format not specified)","submission_metadata (team info, approach description)"],"output_types":["leaderboard_ranking (position and score)","evaluation_report (performance on held-out tasks)","prize_eligibility (if applicable)"],"categories":["automation-workflow","benchmark"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"arc-agi__cap_8","uri":"capability://automation.workflow.task.id.based.environment.instantiation","name":"task-id-based-environment-instantiation","description":"Enables task instantiation by task ID (e.g., 'ls20', 'ft09') through the Arcade.make() factory method, abstracting task loading and initialization. Task IDs map to specific puzzle instances in the benchmark, allowing reproducible task selection and batch evaluation. The factory pattern supports configurable rendering modes and other task parameters.","intents":["Load specific benchmark tasks by ID for reproducible evaluation","Iterate over task sets for batch evaluation","Enable task-specific analysis and debugging","Support reproducible research by specifying exact task instances"],"best_for":["Researchers requiring reproducible task selection","Teams running batch evaluations across task subsets","Developers debugging specific task instances"],"limitations":["Task ID enumeration not provided; full list of available tasks unknown","Task ID naming convention not documented; discovery mechanism unclear","No task metadata (difficulty, category, transformation type) accessible via ID","Task loading mechanism not specified; potential for version mismatches","No task filtering or search capability documented"],"requires":["Valid task ID (format: string, e.g., 'ls20')","Python SDK with Arcade.make() method","Optional: render_mode parameter"],"input_types":["task_id (string identifier)","render_mode (optional, string)"],"output_types":["environment (GameAction-compatible task instance)"],"categories":["automation-workflow","tool-use-integration"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"arc-agi__cap_9","uri":"capability://tool.use.integration.gameaction.discrete.action.space","name":"gameaction-discrete-action-space","description":"Defines a discrete action space through the GameAction enum, enabling agents to interact with tasks through a fixed set of predefined actions. Actions are specified as enum values (e.g., GameAction.ACTION1) and passed to env.step(), abstracting the underlying action semantics. The action space is task-agnostic, supporting a consistent interface across all benchmark tasks.","intents":["Provide agents with a standardized action interface across all benchmark tasks","Enable discrete action selection for planning and reinforcement learning approaches","Abstract task-specific action semantics behind a consistent enum interface","Support action logging and reproducibility through enum-based action specification"],"best_for":["Agents using discrete action selection (planning, RL, search)","Teams requiring consistent action interfaces across task variations","Researchers analyzing action sequences and decision-making"],"limitations":["GameAction enum values not documented; action semantics unknown","Action space size not specified; unclear how many actions are available","No continuous action support; agents limited to discrete choices","Action semantics may vary across tasks; no task-specific action documentation","No action masking or validity checking documented; agents may select invalid actions"],"requires":["Python SDK with GameAction enum","Agent capable of selecting enum values"],"input_types":["GameAction enum value"],"output_types":["observation (task state after action)","done flag (episode termination)","implicit_reward (inferred from state change)"],"categories":["tool-use-integration","planning-reasoning"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"arc-agi__headline","uri":"capability://testing.quality.general.intelligence.benchmark.for.ai.systems","name":"general intelligence benchmark for ai systems","description":"The ARC-AGI benchmark is designed to evaluate general intelligence in AI systems through unique visual puzzles that require abstract reasoning and pattern recognition, offering a $1M prize for solutions that match human performance.","intents":["best AI benchmark for general intelligence","benchmark for evaluating AI reasoning skills","top visual puzzle benchmarks for AI","AI systems performance evaluation tools","general intelligence testing frameworks for AI"],"best_for":["researchers in AI","developers testing AI capabilities"],"limitations":["does not measure emotional intelligence"],"requires":["Python environment for execution"],"input_types":["visual puzzles"],"output_types":["performance scores"],"categories":["testing-quality"],"confidence":0.5,"matches":0,"success_rate":0}],"trust":{"score":62,"verified":false,"data_access_risk":"high","permissions":["Python 3.8+","arc-agi package (installable via pip install arc-agi or uv add arc-agi)","Optional: ARC_API_KEY environment variable for public game access (anonymous key available with limitations)","pip or uv package manager","Optional: .env file with ARC_API_KEY for full feature access","Environment instance (via Arcade.make())","GameAction enum value for action selection","Git for cloning repository","Python 3.8+ for running code","GitHub account for contributions (optional)"],"failure_modes":["Visual-only format excludes language-based reasoning; no text input/output in puzzle solving","Task specifics (grid dimensions, color palettes, transformation rules) not fully documented in public materials","No dynamic task generation or rotation mentioned; contamination risk high in active competition environment","Evaluation protocol and statistical rigor not formally specified; no confidence intervals or significance testing documented","Single-agent focus; does not measure multi-agent coordination or collaborative reasoning","Local execution requires sufficient disk space and memory for all task environments","Rendering mode (terminal) adds latency; exact overhead not quantified","No built-in sandboxing or resource limits documented; malicious agents could consume unbounded compute","API key optional but recommended; anonymous access has undocumented limitations","Scorecard output structure not formally specified; reverse-engineering required for custom metrics","builder identity is not verified yet","no observed match outcomes yet"],"rank_breakdown":{"adoption":0.7,"quality":0.9,"ecosystem":0.3,"match_graph":0.25,"freshness":0.75,"weights":{"adoption":0.25,"quality":0.35,"ecosystem":0.15,"match_graph":0.2,"freshness":0.05}},"observed_outcomes":{"matches":0,"success_rate":0,"avg_confidence":0,"top_intents":[],"last_matched_at":null},"maintenance":{"status":"active","updated_at":"2026-05-24T12:16:19.836Z","last_scraped_at":null,"last_commit":null},"community":{"stars":null,"forks":null,"weekly_downloads":null,"model_downloads":null,"model_likes":null}},"distribution":{"claim_url":"https://unfragile.ai/submit?claim=arc-agi","compare_url":"https://unfragile.ai/compare?artifact=arc-agi"}},"signature":"ltlaAocgJS1HMm+bzCpDg3IBKMBABeDjTK/A4AM8bENTULTIQ38WaKn84YL+G3Yzc5EloAX7y6VQV7i4NUyaDA==","signedAt":"2026-06-22T22:03:11.355Z","signedBy":"unfragile.ai","version":1},"_links":{"self":"https://unfragile.ai/api/v1/passport/arc-agi","artifact":"https://unfragile.ai/arc-agi","verify":"https://unfragile.ai/api/v1/verify?slug=arc-agi","publicKey":"https://unfragile.ai/api/v1/trust-passport-public-key","spec":"https://unfragile.ai/trust","schema":"https://unfragile.ai/schema.json","docs":"https://unfragile.ai/docs"}}