{"passport":{"unfragile":{"@version":"1.0","version":"2026-05","artifact":{"id":"osworld","slug":"osworld","name":"OSWorld","type":"benchmark","url":"https://os-world.github.io","page_url":"https://unfragile.ai/osworld","categories":["testing-quality"],"tags":[],"pricing":{"model":"free","free":true,"starting_price":null},"status":"active","verified":false},"capabilities":[{"id":"osworld__cap_0","uri":"capability://planning.reasoning.real.environment.gui.interaction.evaluation","name":"real-environment gui interaction evaluation","description":"Evaluates multimodal agents' ability to interact with actual operating system graphical interfaces across Ubuntu, Windows, and macOS by executing tasks that require screenshot understanding, mouse/keyboard simulation, and application navigation. Uses custom execution-based evaluation scripts per task that capture initial OS state, execute agent actions, and verify task completion against ground truth outcomes in real sandboxed environments.","intents":["Measure how well AI agents can understand and interact with real desktop GUIs without synthetic abstractions","Test multimodal agent performance on tasks requiring visual perception of UI elements and contextual application of computer skills","Evaluate agent generalization across different operating systems and application ecosystems","Benchmark progress on the gap between human computer task completion (72.36%) and current AI agent capabilities (12.24%)"],"best_for":["AI research teams developing multimodal agents and evaluating GUI understanding capabilities","Companies building autonomous desktop automation tools and needing realistic performance baselines","Researchers studying human-computer interaction and agent behavior in real OS environments"],"limitations":["Evaluation requires actual OS execution in sandboxed VMs, making local evaluation computationally expensive and time-consuming (reduced to ~1 hour with AWS support as of 2025-07-28, but previously significantly longer)","8 of 369 tasks excluded from usable benchmark due to network dependencies requiring manual configuration, reducing effective test set to 361 tasks","No specification of train/dev/test split or data contamination analysis — tasks derived from real-world use cases may overlap with web-scraped LLM training data","Scoring methodology not fully detailed in documentation — unclear whether success is binary, graduated, or includes partial credit; timeout thresholds not specified","No failure mode analysis provided — unclear which task categories agents struggle with most (by OS, application type, or complexity)"],"requires":["Linux/Ubuntu, Windows, or macOS operating system for task execution","Virtualization or containerization infrastructure for sandboxed task execution","Screenshot capture and GUI automation capability (keyboard/mouse simulation or OS-level API access)","Multimodal agent capable of vision-language understanding and action generation","AWS account for accelerated evaluation (optional; local execution supported but slower)"],"input_types":["task descriptions (natural language)","initial OS state configuration (snapshots, file structures, application states)","screenshots from running OS"],"output_types":["binary success/failure verdict per task","execution trace (sequence of agent actions)","performance metrics (success rate, action count, time-to-completion)"],"categories":["planning-reasoning","benchmark-evaluation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"osworld__cap_1","uri":"capability://planning.reasoning.multi.os.task.distribution.and.evaluation","name":"multi-os task distribution and evaluation","description":"Distributes 369 benchmark tasks across three operating systems (Ubuntu, Windows, macOS) with OS-specific initial state configurations and evaluation scripts. Each task includes a detailed setup configuration that establishes the OS environment, file structures, and application states before agent execution, enabling reproducible evaluation of agent performance across platform-specific UI paradigms and application ecosystems.","intents":["Test whether AI agents generalize across different operating systems or develop OS-specific biases","Evaluate agent performance on platform-specific applications and UI conventions (e.g., GNOME vs Windows Explorer vs Finder)","Identify which OS environments present the greatest challenges for multimodal agent understanding","Benchmark agent capability on cross-platform workflows that require knowledge of OS-specific file systems and application behavior"],"best_for":["Teams building cross-platform automation tools who need OS-agnostic agent evaluation","Researchers studying how agent architecture and training data affect OS-specific performance","Organizations deploying agents in heterogeneous enterprise environments with mixed OS deployments"],"limitations":["Task distribution across Ubuntu, Windows, and macOS not specified in documentation — unclear if tasks are balanced or skewed toward one OS","No analysis of OS-specific difficulty or bias — cannot determine if certain OS tasks are inherently harder or if agents have platform-specific blind spots","Initial state setup complexity varies per task but is not quantified — some tasks may require complex multi-application state that is harder to reproduce","Reproducibility depends on exact OS version and application versions — minor OS updates or application patches may invalidate initial state configurations"],"requires":["Ubuntu, Windows, and macOS virtual machines or containers for task execution","Ability to snapshot and restore OS state between task runs","Application installation and configuration capability for each OS platform","OS-specific automation APIs or GUI interaction libraries (e.g., xdotool for Linux, pyautogui for cross-platform)"],"input_types":["OS-specific initial state configuration (file structures, application installations, user settings)","task descriptions with OS-specific context"],"output_types":["per-OS success rates and performance metrics","OS-specific failure modes and agent behavior traces"],"categories":["planning-reasoning","benchmark-evaluation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"osworld__cap_10","uri":"capability://image.visual.gui.grounding.and.visual.understanding.evaluation","name":"gui grounding and visual understanding evaluation","description":"Evaluates agent capability to understand and interact with graphical user interfaces by analyzing screenshots and identifying UI elements, buttons, menus, and text fields. Tests agent ability to visually ground task instructions in the actual UI state, a capability identified as a key limitation in current agents.","intents":["Measure agent capability to understand GUI layouts and identify interactive elements from screenshots","Identify gaps in visual grounding that prevent agents from correctly interpreting UI state","Evaluate vision-language model capability on practical GUI understanding tasks","Track progress in GUI grounding as vision models improve"],"best_for":["Teams developing vision-language models for GUI understanding","Researchers studying visual grounding in multimodal agents","Organizations building GUI automation tools that rely on visual understanding"],"limitations":["GUI grounding identified as a key agent limitation but no detailed analysis provided — unclear what specific aspects of GUI understanding agents struggle with (element detection, text recognition, layout understanding, etc.)","No task taxonomy by GUI complexity or application type — unclear if certain GUIs are harder to understand than others","No analysis of vision model performance on GUI understanding — unclear which vision models are better at GUI grounding","Screenshot resolution and quality not specified — unclear if low-resolution or high-resolution screenshots are used"],"requires":["Vision-language model capable of analyzing screenshots","Screenshot capture from running OS","Ability to identify UI elements and their properties from visual input"],"input_types":["screenshots from running applications","task descriptions requiring GUI understanding"],"output_types":["GUI element identification and interaction","task completion verdict based on visual understanding"],"categories":["image-visual","benchmark-evaluation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"osworld__cap_11","uri":"capability://planning.reasoning.operational.knowledge.and.application.expertise.evaluation","name":"operational knowledge and application expertise evaluation","description":"Evaluates agent capability to understand how to use applications and perform operations within them, testing knowledge of application-specific workflows, menu structures, keyboard shortcuts, and domain-specific operations. Identified as a key limitation in current agents alongside GUI grounding.","intents":["Measure agent capability to understand application-specific workflows and operations","Identify gaps in agent knowledge about how to use common desktop and web applications","Evaluate whether agents can learn and adapt to unfamiliar applications","Track progress in operational knowledge as agents are trained on more diverse applications"],"best_for":["Teams training agents on diverse applications and evaluating operational knowledge","Researchers studying how agents acquire and apply domain-specific knowledge","Organizations deploying agents on applications they use and needing to assess agent capability"],"limitations":["Operational knowledge identified as a key limitation but no detailed analysis provided — unclear what specific operations agents struggle with","No task taxonomy by application type or complexity — unclear if agents struggle more with certain applications","No analysis of agent performance on familiar vs unfamiliar applications — unclear if agents can generalize to new applications","No specification of which applications are included in benchmark — unclear what domain knowledge is required"],"requires":["Agent trained on diverse applications or capable of learning application workflows","Knowledge of application-specific operations and workflows","Ability to understand and execute domain-specific tasks"],"input_types":["task descriptions requiring application-specific knowledge","screenshots from applications"],"output_types":["application operation execution","task completion verdict based on operational knowledge"],"categories":["planning-reasoning","benchmark-evaluation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"osworld__cap_2","uri":"capability://planning.reasoning.custom.execution.based.task.evaluation","name":"custom execution-based task evaluation","description":"Implements task-specific evaluation scripts that execute agent actions against real OS state and verify completion by checking file system changes, application state modifications, and other observable outcomes. Each of the 369 tasks includes a custom evaluation script that defines success criteria, captures execution traces, and produces reproducible verdicts independent of agent architecture or implementation details.","intents":["Define ground truth for task completion in real OS environments where success cannot be determined by simple string matching","Enable reproducible evaluation across different agent implementations and architectures","Capture task-specific success criteria that vary by task complexity and domain (e.g., file operations vs web application use)","Provide detailed execution traces for failure analysis and agent debugging"],"best_for":["Benchmark designers who need flexible, task-specific evaluation logic beyond binary pass/fail","Agent developers who need detailed execution traces and failure diagnostics","Researchers studying agent behavior on complex, multi-step tasks with domain-specific success criteria"],"limitations":["Scoring methodology not fully specified in documentation — unclear whether success is binary, graduated, or includes partial credit for partial task completion","No standardized evaluation script format or schema documented — each script is custom, making it difficult to understand evaluation logic without reading source code","Timeout thresholds and resource limits not specified — unclear how long agents are allowed to execute before evaluation terminates","No inter-rater reliability or evaluation script validation reported — unclear if evaluation scripts consistently and fairly assess task completion","Evaluation script maintenance burden increases with benchmark size — 369 custom scripts require ongoing updates as applications and OS versions change"],"requires":["Custom evaluation script implementation for each task (language/framework not specified in documentation)","Ability to inspect OS state (file system, running processes, application state) after agent execution","Deterministic or reproducible task outcomes (some tasks may have stochastic elements that complicate evaluation)"],"input_types":["agent action trace (sequence of GUI interactions, keyboard/mouse events)","final OS state (file system, application state, window state)"],"output_types":["success/failure verdict","partial credit score (if applicable)","execution trace with timestamps and action details"],"categories":["planning-reasoning","benchmark-evaluation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"osworld__cap_3","uri":"capability://planning.reasoning.real.world.task.scenario.grounding","name":"real-world task scenario grounding","description":"Grounds benchmark tasks in real-world computer use cases derived from actual user workflows, file management operations, application usage patterns, and multi-app interactions. Tasks are not synthetic or artificially constructed but represent genuine computer tasks that users perform, including file organization, document editing, web browsing, email management, and cross-application data workflows.","intents":["Evaluate agent performance on tasks that correlate with actual user needs and real-world computer use","Test agent capability on diverse application ecosystems and workflow patterns beyond narrow synthetic domains","Identify gaps between agent capability and practical utility for autonomous desktop automation","Benchmark progress on tasks that matter to end users, not just tasks that are easy to evaluate"],"best_for":["Teams building practical autonomous desktop automation tools who need realistic performance baselines","Researchers studying the gap between benchmark performance and real-world agent utility","Organizations evaluating whether AI agents can handle their actual computer workflows"],"limitations":["No validation study comparing benchmark performance to actual user task completion — real-world correlation is claimed but not empirically validated","No analysis of task distribution vs actual computer use frequency — unclear if benchmark tasks represent common vs rare use cases","HIGH RISK of data contamination — tasks involve 'arbitrary apps' and 'real web and desktop apps' likely to overlap with web-scraped LLM training data; no statement on whether task descriptions, screenshots, or workflows appear in training corpora","No task taxonomy or categorization provided — unclear how 369 tasks are distributed across application types, complexity levels, or use case domains","Task descriptions not provided in documentation — cannot independently verify real-world relevance or assess potential training data overlap"],"requires":["Access to diverse real-world applications and workflows","Domain expertise to identify representative computer tasks","Ability to set up realistic initial states that match actual user scenarios"],"input_types":["real-world task descriptions","initial OS state matching realistic user scenarios"],"output_types":["task completion verdict","performance metrics on realistic workflows"],"categories":["planning-reasoning","benchmark-evaluation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"osworld__cap_4","uri":"capability://planning.reasoning.multimodal.agent.performance.benchmarking","name":"multimodal agent performance benchmarking","description":"Provides standardized evaluation infrastructure for measuring multimodal agent performance (combining vision and language understanding) on computer task completion. Establishes baseline human performance (72.36% success rate) and current state-of-the-art model performance (12.24% success rate), quantifying the gap between human and AI agent capability on real OS tasks.","intents":["Measure progress in multimodal agent development by tracking success rates on a standardized benchmark","Identify specific capability gaps in current agents (GUI grounding, operational knowledge) that limit performance","Compare different agent architectures and training approaches on a common evaluation framework","Track improvement over time as agents and models evolve"],"best_for":["AI research teams developing and comparing multimodal agents","Model developers evaluating vision-language model improvements on practical tasks","Organizations tracking progress toward human-level autonomous desktop automation"],"limitations":["SOTA model names and architectures not specified in documentation — cannot independently verify claims or reproduce results","No statistical significance testing or confidence intervals reported — unclear if performance differences between models are meaningful","No failure analysis or error categorization — unclear which types of tasks agents struggle with most (GUI grounding vs operational knowledge vs other factors)","Benchmark may be too hard or agents too limited — 60.12 percentage point gap between human (72.36%) and SOTA (12.24%) suggests either benchmark is unreasonably difficult or agents lack fundamental capabilities","No leaderboard transparency — top performing models not publicly listed, making it difficult to track progress or compare approaches","Train/test split not specified — unclear if models are evaluated on held-out tasks or if there is potential for overfitting"],"requires":["Multimodal agent capable of vision-language understanding and action generation","Ability to execute tasks on real OS environments","Evaluation infrastructure to run benchmark and score results"],"input_types":["multimodal agent implementation","task descriptions and initial OS states"],"output_types":["success rate (percentage of tasks completed)","per-task verdicts and execution traces","performance comparison against baselines"],"categories":["planning-reasoning","benchmark-evaluation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"osworld__cap_5","uri":"capability://search.retrieval.interactive.benchmark.data.viewer","name":"interactive benchmark data viewer","description":"Provides a web-based interactive viewer for exploring benchmark tasks, initial states, expected outcomes, and evaluation results. Enables researchers and developers to inspect individual tasks, understand evaluation criteria, and analyze agent performance without requiring local execution of the full benchmark infrastructure.","intents":["Explore benchmark tasks and understand what agents are being evaluated on","Analyze specific task failures and understand why agents struggle with particular tasks","Inspect initial OS states and expected outcomes to verify task correctness","Share benchmark insights and task examples with collaborators without requiring infrastructure setup"],"best_for":["Researchers analyzing benchmark results and identifying patterns in agent failures","Developers debugging agent behavior on specific tasks","Teams collaborating on agent development who need to share task understanding"],"limitations":["Viewer functionality not detailed in documentation — unclear what data is exposed (task descriptions, screenshots, evaluation scripts, agent traces, etc.)","No filtering or search capabilities documented — unclear how users navigate 369 tasks","No export functionality mentioned — unclear if results can be downloaded for analysis","Interactive viewer may have performance limitations with large task set — unclear how responsive the interface is with 369 tasks"],"requires":["Web browser with JavaScript support","Internet connection to access hosted viewer"],"input_types":["benchmark task data","evaluation results"],"output_types":["interactive web interface","task details and metadata","evaluation results and agent traces"],"categories":["search-retrieval","benchmark-evaluation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"osworld__cap_6","uri":"capability://automation.workflow.aws.accelerated.benchmark.evaluation","name":"aws-accelerated benchmark evaluation","description":"Integrates with AWS infrastructure to accelerate benchmark evaluation, reducing full benchmark execution time to approximately 1 hour (as of 2025-07-28 update). Leverages cloud VM provisioning and parallel task execution to speed up evaluation compared to local execution, enabling faster iteration and result collection.","intents":["Reduce benchmark evaluation time from unknown baseline to ~1 hour for faster agent development iteration","Enable parallel task execution across multiple AWS instances to speed up evaluation","Provide cloud-based evaluation infrastructure for teams without local VM resources","Track evaluation time improvements as benchmark infrastructure evolves"],"best_for":["Teams with AWS accounts who want to evaluate agents quickly without local infrastructure","Researchers iterating rapidly on agent improvements and needing fast feedback","Organizations running benchmark evaluations at scale across multiple agent variants"],"limitations":["AWS evaluation time reduced to ~1 hour as of 2025-07-28, but baseline evaluation time before this update is unknown — unclear how much improvement this represents","AWS cost not specified in documentation — unclear what the financial cost of 1-hour evaluation is","No details on parallelization strategy or resource allocation — unclear how many AWS instances are used or how tasks are distributed","AWS support is optional; local evaluation still supported but slower — no performance comparison provided between AWS and local evaluation","Requires AWS account and credentials — adds dependency on external cloud provider"],"requires":["AWS account with appropriate IAM permissions","AWS credentials configured for benchmark execution","Network connectivity to AWS"],"input_types":["agent implementation","AWS configuration (instance types, region, etc.)"],"output_types":["evaluation results","execution time metrics","AWS cost estimates (if available)"],"categories":["automation-workflow","benchmark-evaluation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"osworld__cap_7","uri":"capability://automation.workflow.benchmark.versioning.and.continuous.improvement","name":"benchmark versioning and continuous improvement","description":"Maintains versioned benchmark releases with documented improvements and bug fixes. The 2025-07-28 update introduced 'OSWorld-Verified' with comprehensive improvements including community-reported example fixes and AWS acceleration, indicating active maintenance and responsiveness to feedback.","intents":["Track benchmark improvements and understand how evaluation criteria evolve over time","Ensure reproducibility by specifying which benchmark version was used for evaluation","Benefit from community feedback and bug fixes in newer benchmark versions","Compare results across benchmark versions to understand impact of improvements"],"best_for":["Researchers publishing results who need to specify benchmark version for reproducibility","Teams tracking progress over time and wanting to account for benchmark improvements","Contributors who want to report issues and see them addressed in future releases"],"limitations":["Earlier benchmark version had bugs/issues that were fixed in 2025-07-28 update — results from pre-2025-07-28 may not be comparable to current results","No detailed changelog provided in documentation — unclear what specific bugs were fixed or what 'comprehensive improvements' entail","No migration guide for updating from older to newer benchmark versions — unclear how to re-evaluate agents on new version","No backward compatibility statement — unclear if older evaluation scripts work with newer benchmark versions"],"requires":["Specification of benchmark version when reporting results","Ability to update to new benchmark versions for reproducibility"],"input_types":["benchmark version identifier"],"output_types":["version-specific evaluation results","changelog and improvement notes"],"categories":["automation-workflow","benchmark-evaluation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"osworld__cap_8","uri":"capability://tool.use.integration.open.source.benchmark.infrastructure","name":"open-source benchmark infrastructure","description":"Provides open-source access to benchmark code, evaluation scripts, task data, and documentation, enabling independent verification, extension, and reproduction of benchmark results. All components (code, documentation, data, viewer) are publicly available, supporting transparency and community contribution.","intents":["Enable independent verification of benchmark results and evaluation methodology","Allow researchers to extend benchmark with new tasks or evaluation criteria","Support reproduction of benchmark results without proprietary dependencies","Foster community contributions and improvements to benchmark infrastructure"],"best_for":["Researchers who need to verify benchmark methodology and results","Teams extending benchmark with new tasks or OS platforms","Organizations building on top of benchmark infrastructure for custom evaluations","Open-source communities contributing improvements and bug fixes"],"limitations":["Language/framework of reference implementation not specified in documentation — unclear what programming language or dependencies are required","No contribution guidelines or community process documented — unclear how to submit improvements or new tasks","Open-source license not specified — unclear what restrictions apply to use and modification","Documentation completeness unknown — open-source code may lack detailed comments or architecture documentation","Maintenance burden on open-source community — unclear who maintains code and how responsive they are to issues"],"requires":["Git access to clone benchmark repository","Programming language and dependencies required by reference implementation (not specified)"],"input_types":["benchmark source code","task definitions and evaluation scripts","documentation"],"output_types":["executable benchmark infrastructure","evaluation results","extended benchmark variants"],"categories":["tool-use-integration","benchmark-evaluation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"osworld__cap_9","uri":"capability://planning.reasoning.multi.application.workflow.evaluation","name":"multi-application workflow evaluation","description":"Evaluates agent capability on tasks requiring interaction across multiple applications and OS-level file I/O operations, not just single-application tasks. Tasks include workflows that span web browsers, desktop applications, file managers, and system utilities, testing agent ability to coordinate actions across application boundaries and manage cross-app data flow.","intents":["Test agent capability on realistic workflows that require coordination across multiple applications","Evaluate agent understanding of file systems, data formats, and inter-application data exchange","Identify gaps in agent ability to manage complex, multi-step workflows spanning application boundaries","Benchmark progress on practical automation scenarios that require cross-app coordination"],"best_for":["Teams building practical automation tools that need to orchestrate workflows across multiple applications","Researchers studying agent capability on complex, multi-step tasks","Organizations evaluating whether agents can handle their actual business workflows"],"limitations":["No task taxonomy or categorization by workflow complexity — unclear which tasks are single-app vs multi-app or how many applications are involved","No analysis of multi-app task performance vs single-app performance — unclear if agents struggle more with cross-app coordination","No specification of supported applications or application categories — unclear which applications are included in benchmark","Application version dependencies not documented — unclear if tasks require specific application versions or if they work with multiple versions"],"requires":["Multiple applications installed and configured in OS environment","Agent capability to switch between applications and manage cross-app state","File system access for inter-application data exchange"],"input_types":["multi-application task descriptions","initial OS state with multiple applications configured"],"output_types":["multi-app workflow completion verdict","cross-app action traces"],"categories":["planning-reasoning","benchmark-evaluation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"osworld__headline","uri":"capability://testing.quality.benchmark.for.evaluating.multimodal.agents.in.real.computer.tasks","name":"benchmark for evaluating multimodal agents in real computer tasks","description":"OSWorld is a benchmark designed to evaluate multimodal agents performing real-world computer tasks across Ubuntu, Windows, and macOS, focusing on file management and multi-app workflows.","intents":["best benchmark for multimodal agents","benchmark for evaluating computer task performance","multimodal agent evaluation across operating systems","real-world task benchmark for AI agents","best tools for testing AI in real environments"],"best_for":["researchers in AI","developers testing multimodal agents"],"limitations":["does not measure internet-dependent tasks"],"requires":["real operating system environments"],"input_types":[],"output_types":[],"categories":["testing-quality"],"confidence":0.5,"matches":0,"success_rate":0}],"trust":{"score":62,"verified":false,"data_access_risk":"high","permissions":["Linux/Ubuntu, Windows, or macOS operating system for task execution","Virtualization or containerization infrastructure for sandboxed task execution","Screenshot capture and GUI automation capability (keyboard/mouse simulation or OS-level API access)","Multimodal agent capable of vision-language understanding and action generation","AWS account for accelerated evaluation (optional; local execution supported but slower)","Ubuntu, Windows, and macOS virtual machines or containers for task execution","Ability to snapshot and restore OS state between task runs","Application installation and configuration capability for each OS platform","OS-specific automation APIs or GUI interaction libraries (e.g., xdotool for Linux, pyautogui for cross-platform)","Vision-language model capable of analyzing screenshots"],"failure_modes":["Evaluation requires actual OS execution in sandboxed VMs, making local evaluation computationally expensive and time-consuming (reduced to ~1 hour with AWS support as of 2025-07-28, but previously significantly longer)","8 of 369 tasks excluded from usable benchmark due to network dependencies requiring manual configuration, reducing effective test set to 361 tasks","No specification of train/dev/test split or data contamination analysis — tasks derived from real-world use cases may overlap with web-scraped LLM training data","Scoring methodology not fully detailed in documentation — unclear whether success is binary, graduated, or includes partial credit; timeout thresholds not specified","No failure mode analysis provided — unclear which task categories agents struggle with most (by OS, application type, or complexity)","Task distribution across Ubuntu, Windows, and macOS not specified in documentation — unclear if tasks are balanced or skewed toward one OS","No analysis of OS-specific difficulty or bias — cannot determine if certain OS tasks are inherently harder or if agents have platform-specific blind spots","Initial state setup complexity varies per task but is not quantified — some tasks may require complex multi-application state that is harder to reproduce","Reproducibility depends on exact OS version and application versions — minor OS updates or application patches may invalidate initial state configurations","GUI grounding identified as a key agent limitation but no detailed analysis provided — unclear what specific aspects of GUI understanding agents struggle with (element detection, text recognition, layout understanding, etc.)","builder identity is not verified yet","no observed match outcomes yet"],"rank_breakdown":{"adoption":0.7,"quality":0.9,"ecosystem":0.3,"match_graph":0.25,"freshness":0.75,"weights":{"adoption":0.25,"quality":0.35,"ecosystem":0.15,"match_graph":0.2,"freshness":0.05}},"observed_outcomes":{"matches":0,"success_rate":0,"avg_confidence":0,"top_intents":[],"last_matched_at":null},"maintenance":{"status":"active","updated_at":"2026-05-24T12:16:25.059Z","last_scraped_at":null,"last_commit":null},"community":{"stars":null,"forks":null,"weekly_downloads":null,"model_downloads":null,"model_likes":null}},"distribution":{"claim_url":"https://unfragile.ai/submit?claim=osworld","compare_url":"https://unfragile.ai/compare?artifact=osworld"}},"signature":"LVof04VVn1tZfAidp08iRwU5Qe/oTa2LPBKhqFJMht7Ir+0I2HbwEPAJRDmhRqg9NDr4Js8le7MiJz461jpfCQ==","signedAt":"2026-06-22T09:54:15.371Z","signedBy":"unfragile.ai","version":1},"_links":{"self":"https://unfragile.ai/api/v1/passport/osworld","artifact":"https://unfragile.ai/osworld","verify":"https://unfragile.ai/api/v1/verify?slug=osworld","publicKey":"https://unfragile.ai/api/v1/trust-passport-public-key","spec":"https://unfragile.ai/trust","schema":"https://unfragile.ai/schema.json","docs":"https://unfragile.ai/docs"}}