{"passport":{"unfragile":{"@version":"1.0","version":"2026-05","artifact":{"id":"hn-47784914","slug":"omi-watches-your-screen-hears-conversations-tells-","name":"Omi – watches your screen, hears conversations, tells you what to do","type":"agent","url":"https://github.com/BasedHardware/omi","page_url":"https://unfragile.ai/omi-watches-your-screen-hears-conversations-tells-","categories":["ai-agents"],"tags":["hackernews","show-hn"],"pricing":{"model":"open_source","free":true,"starting_price":null},"status":"active","verified":false},"capabilities":[{"id":"hn-47784914__cap_0","uri":"capability://image.visual.real.time.screen.content.capture.and.analysis","name":"real-time screen content capture and analysis","description":"Continuously captures the active window or full screen at configurable intervals, processes frames through vision models (likely Claude Vision or similar), and extracts semantic understanding of UI state, text content, and visual context. Uses frame buffering and differential analysis to avoid redundant processing of unchanged screens, enabling efficient monitoring of user activity without overwhelming the inference pipeline.","intents":["understand what the user is currently working on without explicit logging","trigger context-aware actions based on detected screen state changes","build a visual memory of user workflow for retrospective analysis"],"best_for":["developers building context-aware AI agents","productivity researchers tracking user behavior","teams implementing ambient intelligence systems"],"limitations":["vision model inference latency creates 500ms-2s delay between screen change and detection","high token consumption for continuous frame analysis — may exceed API quotas on free tiers","privacy-sensitive: captures all screen content including passwords, private messages, and confidential data","no built-in redaction or PII filtering — requires external privacy layer"],"requires":["Python 3.8+","Vision-capable LLM API (OpenAI, Anthropic, or local vision model)","Screen capture permissions (OS-level on macOS/Windows/Linux)","GPU or sufficient CPU for frame processing"],"input_types":["screen frames (PNG/JPEG)","window metadata (title, process name)"],"output_types":["structured scene description (JSON)","detected UI elements and text","semantic understanding of current task"],"categories":["image-visual","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hn-47784914__cap_1","uri":"capability://text.generation.language.ambient.audio.capture.and.speech.to.text.transcription","name":"ambient audio capture and speech-to-text transcription","description":"Captures ambient audio from the device microphone in real-time, streams it to a speech-to-text engine (likely Whisper or similar), and converts spoken words into structured text with speaker identification when possible. Implements audio buffering and VAD (voice activity detection) to avoid processing silence, reducing API calls and latency. Maintains a rolling transcript window for context in subsequent reasoning steps.","intents":["transcribe conversations and meetings without manual note-taking","trigger actions based on spoken commands or context from nearby conversations","build a searchable log of what was discussed around the user"],"best_for":["remote workers in open offices wanting ambient awareness","meeting transcription and action item extraction","developers building voice-aware ambient agents"],"limitations":["background noise degrades transcription accuracy — typical WER 5-15% in noisy environments","no speaker diarization by default — cannot distinguish who said what in multi-person conversations","continuous audio processing creates significant privacy concerns and regulatory compliance issues (GDPR, CCPA, wiretapping laws)","latency of 2-5 seconds between speech and transcription availability"],"requires":["Microphone access with OS-level permissions","Speech-to-text API (OpenAI Whisper, Google Cloud Speech-to-Text, or local Whisper model)","Audio processing library (librosa, PyAudio, or similar)","Sufficient bandwidth for streaming audio if using cloud STT"],"input_types":["raw audio stream (WAV, PCM)","audio chunks (configurable buffer size, typically 1-5 seconds)"],"output_types":["transcribed text with timestamps","confidence scores per segment","structured transcript (JSON with timing metadata)"],"categories":["text-generation-language","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hn-47784914__cap_2","uri":"capability://memory.knowledge.multi.modal.context.aggregation.and.state.management","name":"multi-modal context aggregation and state management","description":"Fuses real-time screen captures, audio transcripts, and user interaction history into a unified context representation that the reasoning engine can query. Implements a sliding-window memory buffer (likely 5-30 minutes of recent context) with semantic indexing to enable efficient retrieval of relevant past states. Uses embeddings or keyword matching to surface contextually relevant information when the agent needs to reason about what the user is doing.","intents":["give the agent a coherent understanding of the user's current task and recent history","enable the agent to reference past context without re-processing all raw data","support multi-turn reasoning where the agent needs to understand how the current state relates to recent activity"],"best_for":["developers building stateful AI agents","teams implementing context-aware automation","researchers studying human-AI interaction patterns"],"limitations":["memory buffer size creates a hard cutoff — events older than the window are lost unless explicitly persisted","no built-in persistence layer — context is lost on agent restart unless external storage is added","semantic indexing adds 50-200ms latency per context query","no automatic privacy filtering — sensitive information (passwords, PII) remains in context unless explicitly redacted"],"requires":["Embedding model (OpenAI, Sentence Transformers, or local) for semantic indexing","In-memory data structure (likely dict/list in Python) or lightweight vector DB (Chroma, Pinecone)","Timestamp synchronization between screen and audio streams"],"input_types":["screen analysis results (JSON)","transcribed audio (text with timestamps)","user action logs (clicks, keystrokes, app switches)"],"output_types":["unified context object (JSON)","retrieved relevant past states","context summaries for reasoning engine"],"categories":["memory-knowledge","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hn-47784914__cap_3","uri":"capability://planning.reasoning.intent.detection.and.action.recommendation","name":"intent detection and action recommendation","description":"Analyzes aggregated context (screen state + transcript + history) through a reasoning model (likely Claude or GPT-4) to infer the user's current intent and recommend proactive actions. Uses chain-of-thought prompting to decompose the user's situation into actionable steps, then ranks recommendations by relevance and confidence. Implements a feedback loop where user acceptance/rejection of recommendations trains the ranking model.","intents":["automatically suggest next steps based on what the user is doing","proactively surface relevant information or tools before the user explicitly asks","reduce cognitive load by automating routine decision-making"],"best_for":["productivity-focused teams wanting ambient assistance","developers building proactive AI agents","power users comfortable with AI making suggestions"],"limitations":["reasoning model inference adds 2-5 second latency per recommendation cycle","high token consumption — continuous context analysis can exceed API quotas quickly","no guarantee of correct intent inference — hallucinations or misinterpretations can lead to irrelevant recommendations","no built-in user preference learning — recommendations don't improve over time without explicit feedback integration"],"requires":["Large language model API (OpenAI GPT-4, Anthropic Claude, or local LLM)","Prompt engineering for intent detection and action ranking","Optional: feedback collection mechanism (user thumbs up/down on recommendations)"],"input_types":["unified context object (screen + audio + history)","user feedback on past recommendations (optional)"],"output_types":["detected user intent (text description)","ranked list of recommended actions (JSON with confidence scores)","reasoning chain (for explainability)"],"categories":["planning-reasoning","text-generation-language"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hn-47784914__cap_4","uri":"capability://tool.use.integration.tool.invocation.and.action.execution","name":"tool invocation and action execution","description":"Translates recommended actions into executable operations by mapping them to available tools (calendar APIs, email clients, code editors, web browsers, etc.). Implements a function-calling interface where the reasoning model can request tool execution with parameters, then executes those requests through OS-level automation (likely AppleScript on macOS, PowerShell on Windows, or D-Bus on Linux) or direct API calls. Includes safety checks to prevent unintended side effects (e.g., confirming before sending emails).","intents":["automatically execute recommended actions without manual user intervention","integrate with existing tools and workflows (calendar, email, code editors, etc.)","enable the agent to take concrete steps toward accomplishing user goals"],"best_for":["teams automating routine workflows","developers building autonomous agents","power users wanting hands-free operation"],"limitations":["tool availability varies by OS and installed applications — not all tools are available on all systems","API-based tool execution requires authentication and credentials management","OS-level automation (AppleScript, PowerShell) is fragile and breaks with UI changes","no built-in rollback mechanism — failed actions may leave the system in an inconsistent state","safety checks can create friction — users may disable them, increasing risk of unintended actions"],"requires":["OS-level automation capabilities (AppleScript, PowerShell, D-Bus, or similar)","API credentials for integrated tools (calendar, email, code hosting, etc.)","Tool-specific SDKs or REST API clients","User consent and permission grants for each tool"],"input_types":["action request with parameters (JSON)","tool identifier and method name"],"output_types":["execution result (success/failure)","tool response data (varies by tool)","side effect confirmation (for safety-critical actions)"],"categories":["tool-use-integration","automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hn-47784914__cap_5","uri":"capability://safety.moderation.privacy.aware.data.retention.and.local.processing","name":"privacy-aware data retention and local processing","description":"Implements configurable data retention policies that control how long screen captures, audio transcripts, and context are stored locally before deletion. Supports optional local processing of sensitive operations (e.g., running Whisper locally instead of sending audio to the cloud) to minimize data transmission. Includes audit logging to track what data was captured, processed, and deleted, enabling compliance with privacy regulations.","intents":["comply with privacy regulations (GDPR, CCPA) by controlling data retention","minimize exposure of sensitive information by processing locally when possible","provide transparency and auditability of data handling practices"],"best_for":["enterprises with strict privacy and compliance requirements","teams handling sensitive data (healthcare, finance, legal)","privacy-conscious developers building ambient agents"],"limitations":["local processing of large models (Whisper, vision models) requires significant GPU/CPU resources","data retention policies create operational complexity — users must manage cleanup and storage","audit logging adds overhead and storage requirements","no automatic PII detection or redaction — requires manual configuration or external tools","compliance with regulations is user's responsibility — tool provides mechanisms but not guarantees"],"requires":["Local storage with sufficient capacity (10-100GB+ depending on retention policy)","Optional: GPU for local model inference (NVIDIA CUDA, Apple Metal, or similar)","Privacy policy and data handling procedures","Compliance expertise to configure retention policies correctly"],"input_types":["retention policy configuration (JSON)","data deletion requests","audit log queries"],"output_types":["audit logs (CSV or JSON)","data deletion confirmations","compliance reports"],"categories":["safety-moderation","automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hn-47784914__cap_6","uri":"capability://memory.knowledge.user.feedback.integration.and.preference.learning","name":"user feedback integration and preference learning","description":"Collects explicit user feedback (thumbs up/down, corrections, rejections) on agent recommendations and uses this to refine future suggestions. Implements a lightweight preference model that tracks which types of recommendations the user accepts or rejects, enabling personalization without requiring full model retraining. Stores preferences locally and uses them to re-rank recommendations before presenting them to the user.","intents":["improve recommendation relevance over time based on user feedback","personalize the agent's behavior to match individual user preferences","reduce irrelevant suggestions through continuous learning"],"best_for":["long-term users wanting personalized assistance","teams deploying agents across multiple users with different preferences","developers building adaptive AI systems"],"limitations":["preference learning requires significant user feedback — cold-start problem for new users","no built-in mechanism to detect preference drift over time","local preference storage is not shared across devices — each device learns independently","no privacy-preserving federated learning — preferences are stored locally but not aggregated across users"],"requires":["Feedback collection UI (buttons, voice commands, or implicit signals)","Local preference storage (database or JSON file)","Recommendation re-ranking logic"],"input_types":["user feedback (accept/reject/correct)","recommendation ID and context"],"output_types":["updated preference model","re-ranked recommendations","personalization metrics"],"categories":["memory-knowledge","planning-reasoning"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hn-47784914__cap_7","uri":"capability://image.visual.cross.platform.screen.and.audio.capture","name":"cross-platform screen and audio capture","description":"Abstracts OS-specific screen capture and audio APIs (macOS: AVFoundation/ScreenCaptureKit, Windows: DXGI/Windows.Media.Capture, Linux: X11/Wayland/PulseAudio) behind a unified interface, enabling the agent to work consistently across platforms. Handles platform-specific permissions, frame rate negotiation, and audio format conversion automatically. Implements fallback mechanisms for unsupported configurations (e.g., Wayland on Linux).","intents":["deploy the agent on macOS, Windows, and Linux without code changes","handle platform-specific permission and capability differences transparently","ensure consistent capture quality and latency across platforms"],"best_for":["cross-platform agent deployments","teams supporting multiple operating systems","developers building portable ambient intelligence systems"],"limitations":["platform-specific APIs have different capabilities — some features unavailable on certain OS (e.g., speaker diarization on Linux)","permission models vary significantly — macOS requires explicit user consent, Windows uses UAC, Linux uses D-Bus","frame rate and resolution vary by platform and hardware — no guarantee of consistent performance","Wayland support on Linux is incomplete — many tools still require X11"],"requires":["macOS 10.15+ (for ScreenCaptureKit) or Windows 10+ (for DXGI) or Linux with X11/Wayland","Platform-specific SDKs (Xcode on macOS, Windows SDK on Windows, libxcb on Linux)","Audio API libraries (AVFoundation on macOS, Windows.Media.Capture on Windows, PulseAudio on Linux)"],"input_types":["capture configuration (resolution, frame rate, audio format)"],"output_types":["screen frames (PNG/JPEG)","audio chunks (WAV/PCM)","capture metadata (timestamp, resolution, frame rate)"],"categories":["image-visual","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hn-47784914__cap_8","uri":"capability://automation.workflow.real.time.performance.monitoring.and.optimization","name":"real-time performance monitoring and optimization","description":"Tracks CPU, memory, and API usage in real-time, implementing adaptive throttling to prevent resource exhaustion. Monitors inference latency and adjusts capture frequency or context window size dynamically to maintain responsiveness. Implements metrics collection (frame processing time, API call latency, token consumption) for debugging and optimization. Provides dashboards or CLI output showing resource usage and performance bottlenecks.","intents":["prevent the agent from consuming excessive resources and degrading system performance","identify performance bottlenecks and optimize the most expensive operations","track API costs and token consumption to manage cloud expenses"],"best_for":["developers optimizing agent performance","teams managing cloud API costs","power users running agents on resource-constrained devices"],"limitations":["monitoring itself adds overhead — typically 5-10% CPU/memory increase","adaptive throttling may reduce recommendation quality if too aggressive","no built-in cost optimization — users must manually adjust parameters based on metrics","metrics collection requires external storage or logging service for long-term analysis"],"requires":["System monitoring libraries (psutil on Python, os.system on shell)","Metrics collection framework (Prometheus, CloudWatch, or custom logging)","Optional: visualization tools (Grafana, custom dashboards)"],"input_types":["system resource snapshots (CPU, memory, disk)","API call logs (latency, tokens, cost)","performance configuration (throttling thresholds)"],"output_types":["performance metrics (JSON or time-series format)","resource usage reports","optimization recommendations"],"categories":["automation-workflow","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hn-47784914__cap_9","uri":"capability://tool.use.integration.extensible.plugin.architecture.for.custom.tools.and.integrations","name":"extensible plugin architecture for custom tools and integrations","description":"Provides a plugin interface that allows developers to register custom tools, integrations, and reasoning modules without modifying core code. Implements a discovery mechanism (likely directory scanning or manifest-based) to load plugins at startup, and a standardized interface (function signature, input/output schema) for plugins to expose capabilities. Supports plugins written in Python or via REST APIs, enabling integration with external services and custom business logic.","intents":["extend the agent with custom tools specific to a team's workflow","integrate with proprietary or internal systems without modifying core code","enable third-party developers to build on top of the agent"],"best_for":["enterprises with custom workflows and integrations","teams building agent ecosystems","developers extending the agent with domain-specific capabilities"],"limitations":["plugin security is user's responsibility — malicious plugins can access screen/audio data","no built-in plugin versioning or dependency management","plugin discovery and loading adds startup latency","no standardized plugin marketplace or distribution mechanism","REST API-based plugins add network latency and require external infrastructure"],"requires":["Python 3.8+ for native plugins","Plugin manifest format (JSON or YAML)","Optional: REST API server for remote plugins","Documentation and examples for plugin developers"],"input_types":["plugin manifest (JSON)","tool request with parameters"],"output_types":["plugin execution result","tool response data","plugin metadata (capabilities, version)"],"categories":["tool-use-integration","automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0}],"trust":{"score":34,"verified":false,"data_access_risk":"high","permissions":["Python 3.8+","Vision-capable LLM API (OpenAI, Anthropic, or local vision model)","Screen capture permissions (OS-level on macOS/Windows/Linux)","GPU or sufficient CPU for frame processing","Microphone access with OS-level permissions","Speech-to-text API (OpenAI Whisper, Google Cloud Speech-to-Text, or local Whisper model)","Audio processing library (librosa, PyAudio, or similar)","Sufficient bandwidth for streaming audio if using cloud STT","Embedding model (OpenAI, Sentence Transformers, or local) for semantic indexing","In-memory data structure (likely dict/list in Python) or lightweight vector DB (Chroma, Pinecone)"],"failure_modes":["vision model inference latency creates 500ms-2s delay between screen change and detection","high token consumption for continuous frame analysis — may exceed API quotas on free tiers","privacy-sensitive: captures all screen content including passwords, private messages, and confidential data","no built-in redaction or PII filtering — requires external privacy layer","background noise degrades transcription accuracy — typical WER 5-15% in noisy environments","no speaker diarization by default — cannot distinguish who said what in multi-person conversations","continuous audio processing creates significant privacy concerns and regulatory compliance issues (GDPR, CCPA, wiretapping laws)","latency of 2-5 seconds between speech and transcription availability","memory buffer size creates a hard cutoff — events older than the window are lost unless explicitly persisted","no built-in persistence layer — context is lost on agent restart unless external storage is added","builder identity is not verified yet","no observed match outcomes yet"],"rank_breakdown":{"adoption":0.36,"quality":0.3,"ecosystem":0.46,"match_graph":0.25,"freshness":0.52,"weights":{"adoption":0.25,"quality":0.25,"ecosystem":0.1,"match_graph":0.28,"freshness":0.12}},"observed_outcomes":{"matches":0,"success_rate":0,"avg_confidence":0,"top_intents":[],"last_matched_at":null},"maintenance":{"status":"active","updated_at":"2026-06-17T09:51:04.692Z","last_scraped_at":"2026-05-04T08:10:04.759Z","last_commit":null},"community":{"stars":null,"forks":null,"weekly_downloads":null,"model_downloads":null,"model_likes":null}},"distribution":{"claim_url":"https://unfragile.ai/submit?claim=omi-watches-your-screen-hears-conversations-tells-","compare_url":"https://unfragile.ai/compare?artifact=omi-watches-your-screen-hears-conversations-tells-"}},"signature":"PUi0EEagN9bhIAULjfNlg90gE0fgvs2VrhLAzlW2ZQsb/3rSQFsIisGLCcK+YkEhqP4c5Sh25MDi2yknxO9HBg==","signedAt":"2026-06-21T03:21:45.430Z","signedBy":"unfragile.ai","version":1},"_links":{"self":"https://unfragile.ai/api/v1/passport/omi-watches-your-screen-hears-conversations-tells-","artifact":"https://unfragile.ai/omi-watches-your-screen-hears-conversations-tells-","verify":"https://unfragile.ai/api/v1/verify?slug=omi-watches-your-screen-hears-conversations-tells-","publicKey":"https://unfragile.ai/api/v1/trust-passport-public-key","spec":"https://unfragile.ai/trust","schema":"https://unfragile.ai/schema.json","docs":"https://unfragile.ai/docs"}}