{"passport":{"unfragile":{"@version":"1.0","version":"2026-05","artifact":{"id":"openrouter-baidu-ernie-4.5-vl-424b-a47b","slug":"baidu-ernie-4.5-vl-424b-a47b","name":"Baidu: ERNIE 4.5 VL 424B A47B ","type":"model","url":"https://openrouter.ai/models/baidu~ernie-4.5-vl-424b-a47b","page_url":"https://unfragile.ai/baidu-ernie-4.5-vl-424b-a47b","categories":["image-generation"],"tags":["baidu","api-access","text","image"],"pricing":{"model":"paid","free":false,"starting_price":"$4.20e-7 per prompt token"},"status":"active","verified":false},"capabilities":[{"id":"openrouter-baidu-ernie-4.5-vl-424b-a47b__cap_0","uri":"capability://image.visual.multimodal.vision.language.understanding.with.sparse.moe.routing","name":"multimodal vision-language understanding with sparse moe routing","description":"Processes both text and image inputs simultaneously using a 424B parameter Mixture-of-Experts architecture where only 47B parameters activate per token. The model routes different input modalities and semantic contexts through specialized expert sub-networks, enabling efficient joint reasoning across text and visual content without full model activation. This sparse routing pattern reduces computational overhead while maintaining cross-modal coherence through shared embedding spaces and attention mechanisms trained jointly on aligned text-image datasets.","intents":["I need to analyze images with detailed text descriptions and answer questions about visual content","I want to extract structured information from documents that contain both text and images","I need to generate detailed captions or descriptions for images with contextual understanding","I want to perform visual reasoning tasks that require understanding relationships between text and visual elements"],"best_for":["teams building document understanding systems for mixed-media content","developers creating multimodal search or retrieval applications","enterprises processing scanned documents with OCR + semantic understanding","AI product teams needing efficient inference for vision-language tasks at scale"],"limitations":["MoE routing adds latency variance — expert selection overhead ~50-100ms depending on input complexity","Sparse activation means some expert pathways may be undertrained for rare input combinations","Image resolution and aspect ratio handling not specified — may have constraints on input dimensions","No fine-tuning API documented — limited customization for domain-specific vision-language tasks","Requires API access through OpenRouter — no local deployment option available"],"requires":["OpenRouter API key with Baidu model access enabled","HTTP/REST client capability or SDK wrapper (Python, JavaScript, etc.)","Images in standard formats (JPEG, PNG, WebP) — exact supported formats not specified","Text input encoding as UTF-8"],"input_types":["text (natural language queries, descriptions, prompts)","image (JPEG, PNG, WebP — specific resolution limits unknown)","mixed text+image sequences in single request"],"output_types":["text (natural language responses, descriptions, answers)","structured data (JSON-formatted extractions if prompted)","reasoning traces (chain-of-thought explanations)"],"categories":["image-visual","text-generation-language"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"openrouter-baidu-ernie-4.5-vl-424b-a47b__cap_1","uri":"capability://image.visual.image.to.text.visual.description.and.captioning","name":"image-to-text visual description and captioning","description":"Generates natural language descriptions, captions, and detailed textual explanations of image content by processing visual features through the model's vision encoder and routing them through language generation experts. The model maps visual regions to semantic tokens and generates coherent multi-sentence descriptions that capture objects, relationships, actions, and scene context. This capability leverages the joint training on image-caption pairs to produce contextually appropriate descriptions at varying levels of detail.","intents":["I need to generate alt-text or accessibility descriptions for images automatically","I want to create detailed captions for images in a content management system","I need to summarize what's happening in an image in natural language","I want to extract a brief summary or long-form description of visual content"],"best_for":["content creators and publishers automating image captioning workflows","accessibility teams generating alt-text at scale for web properties","e-commerce platforms creating product descriptions from images","digital asset management systems indexing visual content with natural language"],"limitations":["Caption length and style not configurable through API — model generates fixed-format descriptions","No control over detail level (brief vs. exhaustive) without prompt engineering","Performance on highly abstract, artistic, or non-photographic images not documented","Batch processing not explicitly supported — requires sequential API calls per image","Hallucination risk for images with ambiguous or novel content — may generate plausible but incorrect details"],"requires":["OpenRouter API key with Baidu ERNIE 4.5 VL access","Image file in supported format (JPEG, PNG, WebP)","Text prompt or instruction to guide caption generation style"],"input_types":["image (JPEG, PNG, WebP)","text (optional prompt specifying caption style, length, or focus)"],"output_types":["text (natural language caption or description, typically 1-5 sentences)"],"categories":["image-visual","text-generation-language"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"openrouter-baidu-ernie-4.5-vl-424b-a47b__cap_2","uri":"capability://image.visual.visual.question.answering.with.cross.modal.reasoning","name":"visual question answering with cross-modal reasoning","description":"Answers natural language questions about image content by jointly processing visual features and textual queries through cross-attention mechanisms that bind image regions to question tokens. The model routes question-image pairs through expert networks specialized in visual reasoning, object detection, spatial relationships, and semantic understanding. Responses are generated token-by-token with attention weights distributed across both image patches and question context, enabling reasoning that requires understanding both 'what' is in the image and 'how' it relates to the question.","intents":["I want to ask questions about image content and get accurate answers","I need to verify facts or extract specific information from images","I want to understand relationships, counts, or spatial arrangements in images","I need to perform visual reasoning tasks like 'what would happen if' or 'why' questions"],"best_for":["teams building document Q&A systems over scanned PDFs or images","developers creating visual search or image understanding APIs","enterprises automating inspection or quality control with visual reasoning","educational platforms enabling interactive learning with image-based content"],"limitations":["Reasoning depth limited by context window — complex multi-step visual reasoning may fail","No explicit support for counting large numbers of objects — accuracy degrades beyond ~20 items","Spatial reasoning (left/right, above/below) may be inconsistent for complex scenes","No ability to track object identity across multiple images or temporal sequences","Hallucination risk — may confidently answer questions about details not present in image"],"requires":["OpenRouter API key with Baidu ERNIE 4.5 VL access","Image file (JPEG, PNG, WebP)","Natural language question as text input"],"input_types":["image (JPEG, PNG, WebP)","text (natural language question about image content)"],"output_types":["text (natural language answer, typically 1-3 sentences)","structured data (if prompted to format as JSON or key-value pairs)"],"categories":["image-visual","text-generation-language"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"openrouter-baidu-ernie-4.5-vl-424b-a47b__cap_3","uri":"capability://data.processing.analysis.document.understanding.and.information.extraction.from.mixed.media.content","name":"document understanding and information extraction from mixed-media content","description":"Extracts structured information from documents containing both text and images (e.g., scanned PDFs, forms, invoices) by jointly processing visual layout and textual content through specialized extraction experts. The model identifies document structure, locates relevant fields, and extracts values while understanding context from both visual positioning and semantic text content. This capability combines OCR-like visual text recognition with semantic understanding to handle forms, tables, invoices, and complex document layouts where information is conveyed through both text and visual arrangement.","intents":["I need to extract key information from scanned invoices or receipts automatically","I want to parse form data from images or PDFs with mixed text and visual elements","I need to understand table structures and extract data from images of tables","I want to identify and extract specific fields from documents with variable layouts"],"best_for":["financial services teams automating invoice and receipt processing","document management platforms extracting metadata from scanned documents","compliance teams processing regulatory documents with mixed content","RPA platforms augmenting workflow automation with visual document understanding"],"limitations":["No explicit table parsing capability documented — may struggle with complex multi-column layouts","Handwritten text recognition not specified — likely optimized for printed text only","Document rotation and skew handling not documented — may require pre-processing","No support for multi-page document processing in single request — requires page-by-page extraction","Extraction accuracy depends heavily on document quality and layout consistency","No fine-tuning for domain-specific document types (e.g., medical records, legal contracts)"],"requires":["OpenRouter API key with Baidu ERNIE 4.5 VL access","Document image or scanned page (JPEG, PNG, WebP)","Structured prompt specifying fields to extract or extraction format (JSON schema)"],"input_types":["image (scanned document, form, invoice, receipt — JPEG, PNG, WebP)","text (extraction instructions or field specifications)"],"output_types":["structured data (JSON with extracted key-value pairs)","text (natural language extraction results)"],"categories":["data-processing-analysis","image-visual"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"openrouter-baidu-ernie-4.5-vl-424b-a47b__cap_4","uri":"capability://image.visual.image.understanding.with.contextual.text.integration","name":"image understanding with contextual text integration","description":"Analyzes images in the context of accompanying or related text (e.g., image + article text, image + product description) to provide deeper understanding that combines visual and textual context. The model processes image and text inputs jointly, allowing text context to disambiguate visual content and visual content to ground textual claims. This enables tasks like fact-checking images against text, understanding images in narrative context, or enriching image analysis with textual metadata.","intents":["I want to verify if an image matches or contradicts accompanying text or claims","I need to understand an image in the context of an article or description","I want to enrich image analysis with metadata or contextual text information","I need to detect inconsistencies between visual content and textual descriptions"],"best_for":["fact-checking platforms verifying claims against visual evidence","content moderation teams analyzing images with context","e-commerce platforms matching product images to descriptions","news organizations verifying image authenticity and context"],"limitations":["No explicit fact-checking or claim verification mode — requires careful prompt engineering","Context window limits the amount of accompanying text that can be processed","Bias toward text over image or vice versa not documented — may over-weight one modality","No ability to detect manipulated or synthetic images — visual analysis is semantic, not forensic","Hallucination risk when text and image conflict — may generate plausible but false reconciliations"],"requires":["OpenRouter API key with Baidu ERNIE 4.5 VL access","Image file (JPEG, PNG, WebP)","Accompanying text (article excerpt, description, metadata, or claims)"],"input_types":["image (JPEG, PNG, WebP)","text (contextual information, descriptions, claims, or metadata)"],"output_types":["text (analysis, verification results, or contextual understanding)","structured data (JSON with consistency scores or fact-check results if prompted)"],"categories":["image-visual","text-generation-language"],"confidence":0.5,"matches":0,"success_rate":0}],"trust":{"score":23,"verified":false,"data_access_risk":"low","permissions":["OpenRouter API key with Baidu model access enabled","HTTP/REST client capability or SDK wrapper (Python, JavaScript, etc.)","Images in standard formats (JPEG, PNG, WebP) — exact supported formats not specified","Text input encoding as UTF-8","OpenRouter API key with Baidu ERNIE 4.5 VL access","Image file in supported format (JPEG, PNG, WebP)","Text prompt or instruction to guide caption generation style","Image file (JPEG, PNG, WebP)","Natural language question as text input","Document image or scanned page (JPEG, PNG, WebP)"],"failure_modes":["MoE routing adds latency variance — expert selection overhead ~50-100ms depending on input complexity","Sparse activation means some expert pathways may be undertrained for rare input combinations","Image resolution and aspect ratio handling not specified — may have constraints on input dimensions","No fine-tuning API documented — limited customization for domain-specific vision-language tasks","Requires API access through OpenRouter — no local deployment option available","Caption length and style not configurable through API — model generates fixed-format descriptions","No control over detail level (brief vs. exhaustive) without prompt engineering","Performance on highly abstract, artistic, or non-photographic images not documented","Batch processing not explicitly supported — requires sequential API calls per image","Hallucination risk for images with ambiguous or novel content — may generate plausible but incorrect details","builder identity is not verified yet","no observed match outcomes yet"],"rank_breakdown":{"adoption":0.05,"quality":0.35,"ecosystem":0.27,"match_graph":0.25,"freshness":0.75,"weights":{"adoption":0.35,"quality":0.2,"ecosystem":0.1,"match_graph":0.3,"freshness":0.05}},"observed_outcomes":{"matches":0,"success_rate":0,"avg_confidence":0,"top_intents":[],"last_matched_at":null},"maintenance":{"status":"active","updated_at":"2026-05-24T12:16:24.484Z","last_scraped_at":"2026-05-03T15:20:45.776Z","last_commit":null},"community":{"stars":null,"forks":null,"weekly_downloads":null,"model_downloads":null,"model_likes":null}},"distribution":{"claim_url":"https://unfragile.ai/submit?claim=baidu-ernie-4.5-vl-424b-a47b","compare_url":"https://unfragile.ai/compare?artifact=baidu-ernie-4.5-vl-424b-a47b"}},"signature":"W2YJ6jr65XQc7xByEnM2hqRG0WoOkHT2ZPDNlYoU9YmotDacpIpg9qsXpaAQr63djmd1WyIc61UQMbCQzvJ0Cg==","signedAt":"2026-06-20T19:07:53.928Z","signedBy":"unfragile.ai","version":1},"_links":{"self":"https://unfragile.ai/api/v1/passport/baidu-ernie-4.5-vl-424b-a47b","artifact":"https://unfragile.ai/baidu-ernie-4.5-vl-424b-a47b","verify":"https://unfragile.ai/api/v1/verify?slug=baidu-ernie-4.5-vl-424b-a47b","publicKey":"https://unfragile.ai/api/v1/trust-passport-public-key","spec":"https://unfragile.ai/trust","schema":"https://unfragile.ai/schema.json","docs":"https://unfragile.ai/docs"}}