{"passport":{"unfragile":{"@version":"1.0","version":"2026-05","artifact":{"id":"paligemma","slug":"paligemma","name":"PaliGemma","type":"model","url":"https://ai.google.dev/gemma/docs/paligemma","page_url":"https://unfragile.ai/paligemma","categories":["model-training"],"tags":[],"pricing":{"model":"free","free":true,"starting_price":null},"status":"active","verified":false},"capabilities":[{"id":"paligemma__cap_0","uri":"capability://image.visual.fine.grained.optical.character.recognition.with.visual.context","name":"fine-grained optical character recognition with visual context","description":"Extracts and recognizes text from images at multiple resolutions (224×224 to 896×896 pixels) using a SigLIP vision encoder that processes visual features into a token sequence, which is then decoded by the Gemma language model to produce accurate character-level transcriptions. The hybrid architecture enables the model to understand text within its visual context rather than treating OCR as isolated character recognition, improving accuracy on documents with complex layouts, handwriting, or degraded quality.","intents":["extract text from document images, screenshots, or photos for downstream processing","build OCR pipelines that understand document structure and layout context","recognize text in natural images where characters appear at varying scales or angles","transcribe text from images containing multiple languages or mixed scripts"],"best_for":["document processing teams building enterprise OCR systems","developers creating accessibility tools for image-to-text conversion","researchers working on fine-grained visual understanding benchmarks"],"limitations":["Pretrained PT variants require fine-tuning on target OCR tasks before producing reliable results; mix variants are pre-tuned but may not match domain-specific accuracy","Maximum input resolution of 896×896 pixels requires downsampling or tiling for larger documents, potentially losing fine details","No built-in handling of multi-page documents; each image must be processed independently","Context window size unknown, limiting ability to process very long text sequences within single images"],"requires":["Python 3.8+ runtime","JAX framework for fine-tuning (if using PT variants)","GPU with sufficient VRAM for 3B/10B/28B model variants (exact requirements unknown)","Access to Hugging Face or Kaggle model hub for downloading weights"],"input_types":["image (JPEG, PNG, WebP at 224×224, 448×448, or 896×896 resolution)","text prompt (optional, for guided OCR or context-aware extraction)"],"output_types":["text (extracted character sequences with preserved formatting intent)"],"categories":["image-visual","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"paligemma__cap_1","uri":"capability://image.visual.visual.question.answering.with.fine.grained.image.understanding","name":"visual question answering with fine-grained image understanding","description":"Processes natural language questions about image content by encoding the image through SigLIP's vision transformer to extract spatial and semantic features, then feeding both the visual tokens and the question text to Gemma's decoder, which generates natural language answers grounded in specific image regions. The architecture enables answering questions requiring detailed visual reasoning, object relationships, and scene understanding rather than simple image classification.","intents":["build interactive image exploration tools where users ask questions about visual content","create accessibility features that describe image content in response to user queries","develop visual search and retrieval systems that understand semantic relationships in images","implement quality assurance workflows that verify image content matches expected properties"],"best_for":["product teams building image annotation and curation platforms","accessibility engineers creating tools for visually impaired users","e-commerce companies implementing visual search and product discovery","content moderation teams automating image review workflows"],"limitations":["Pretrained PT variants require fine-tuning on VQA datasets before reliable deployment; mix variants are pre-tuned but may not generalize to specialized domains","Answer quality depends on question clarity and image resolution; ambiguous questions may produce hallucinated or incorrect answers","No explicit grounding mechanism to highlight which image regions support the answer, limiting interpretability","Maximum 896×896 input resolution may lose fine details needed for questions about small objects or distant elements"],"requires":["Python 3.8+ runtime","JAX framework for fine-tuning (if using PT variants)","GPU with sufficient VRAM (exact requirements unknown but likely 8GB+ for 3B variant)","Hugging Face or Kaggle model hub access for weight download"],"input_types":["image (JPEG, PNG, WebP at 224×224, 448×448, or 896×896 resolution)","text (natural language question about image content)"],"output_types":["text (natural language answer to the visual question)"],"categories":["image-visual","text-generation-language"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"paligemma__cap_10","uri":"capability://automation.workflow.colab.based.interactive.fine.tuning.and.inference.notebooks","name":"colab-based interactive fine-tuning and inference notebooks","description":"Provides Google Colab notebooks that enable interactive fine-tuning and inference without local GPU setup, leveraging Colab's free GPU resources and JAX runtime. Developers can run detection, content generation, and fine-tuning workflows directly in notebooks with minimal setup, enabling rapid prototyping and experimentation without infrastructure investment.","intents":["prototype vision-language models without local GPU infrastructure","fine-tune models on custom datasets using free Colab GPU resources","experiment with different model variants and task configurations interactively","share reproducible notebooks with collaborators for collaborative development"],"best_for":["researchers and students without access to local GPU infrastructure","teams prototyping models before production deployment","educators teaching vision-language models and transfer learning","developers exploring PaliGemma capabilities before committing to production use"],"limitations":["Colab GPU resources are limited and may be preempted; not suitable for long-running training jobs","Colab session timeout after inactivity; requires checkpointing for long fine-tuning runs","No persistent storage; models and datasets must be downloaded each session or stored in Google Drive","Colab GPU type varies (K80, T4, P100); no guarantee of specific hardware","Notebook-based development not suitable for production inference; requires conversion to production code"],"requires":["Google account for Colab access","Internet connection for Colab session","Google Drive access for persistent storage (optional but recommended)","Familiarity with Jupyter notebooks and Python"],"input_types":["image (JPEG, PNG, WebP at 224×224, 448×448, or 896×896 resolution)","text (task-specific prompts or questions)","labeled dataset (for fine-tuning)"],"output_types":["text (task-specific outputs: OCR, VQA answers, object descriptions, segmentation descriptions, captions)","model weights (fine-tuned checkpoint)","training metrics (loss curves, validation accuracy)"],"categories":["automation-workflow","code-generation-editing"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"paligemma__cap_2","uri":"capability://image.visual.object.detection.and.localization.with.bounding.box.generation","name":"object detection and localization with bounding box generation","description":"Identifies objects within images and generates their spatial locations by encoding the image through SigLIP to extract region-level visual features, then using Gemma to decode these features into structured text descriptions that include object categories and bounding box coordinates. The approach treats object detection as a text generation problem, enabling flexible output formats and the ability to describe objects using natural language rather than fixed class vocabularies.","intents":["build computer vision pipelines that detect and locate objects without requiring labeled training data for every object class","create inventory management systems that automatically identify and locate items in warehouse or retail images","develop autonomous systems that need to understand object positions for navigation or manipulation tasks","implement visual search systems that find specific objects within images and return their locations"],"best_for":["computer vision engineers building flexible detection systems that adapt to new object classes","robotics teams implementing visual perception for manipulation and navigation","retail and logistics companies automating inventory and shelf-scanning workflows","research teams exploring open-vocabulary object detection approaches"],"limitations":["Pretrained PT variants require fine-tuning on detection datasets; mix variants are pre-tuned but may not handle novel object classes without adaptation","Text-based bounding box output requires parsing and validation; malformed coordinates may occur in edge cases","No explicit confidence scores per detection; answer quality depends on model's implicit confidence in generated text","Maximum 896×896 input resolution limits ability to detect small objects or objects in distant scenes","Detection accuracy depends on question/prompt clarity; vague queries may produce incomplete or incorrect localizations"],"requires":["Python 3.8+ runtime","JAX framework for fine-tuning (if using PT variants)","GPU with sufficient VRAM (exact requirements unknown)","Hugging Face or Kaggle model hub access","Post-processing logic to parse text-based bounding box coordinates into structured format"],"input_types":["image (JPEG, PNG, WebP at 224×224, 448×448, or 896×896 resolution)","text prompt (optional, specifying which objects to detect or localize)"],"output_types":["text (object descriptions with bounding box coordinates in text format, requiring parsing)"],"categories":["image-visual","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"paligemma__cap_3","uri":"capability://image.visual.pixel.level.image.segmentation.with.semantic.understanding","name":"pixel-level image segmentation with semantic understanding","description":"Performs semantic and instance segmentation by encoding images through SigLIP's spatial feature extraction, then using Gemma to generate segmentation masks or semantic descriptions of pixel-level regions. The vision-language approach enables segmentation that understands semantic meaning of regions rather than treating segmentation as purely geometric pixel clustering, allowing the model to segment based on object categories, materials, or semantic concepts.","intents":["build image editing tools that enable semantic selection of image regions for manipulation","create medical imaging analysis systems that segment anatomical structures or pathologies","develop agricultural monitoring systems that segment crop types, soil conditions, or disease areas","implement scene understanding pipelines that parse images into semantically meaningful regions"],"best_for":["medical imaging teams automating anatomical segmentation and pathology detection","agricultural technology companies monitoring crop health and field conditions","image editing software developers implementing intelligent selection and masking","autonomous systems teams building scene understanding for navigation and planning"],"limitations":["Pretrained PT variants require fine-tuning on segmentation datasets; mix variants are pre-tuned but may not generalize to specialized domains like medical imaging","Output format (mask generation vs. text description) depends on fine-tuning approach; no standardized segmentation output format documented","Maximum 896×896 input resolution limits ability to segment fine details or large-scale scenes","No explicit confidence maps per pixel; segmentation quality depends on model's implicit confidence in generated masks","Semantic segmentation quality depends on whether model was fine-tuned on target semantic categories"],"requires":["Python 3.8+ runtime","JAX framework for fine-tuning (if using PT variants)","GPU with sufficient VRAM (exact requirements unknown)","Hugging Face or Kaggle model hub access","Post-processing logic to convert text-based or probabilistic outputs into binary or multi-class segmentation masks"],"input_types":["image (JPEG, PNG, WebP at 224×224, 448×448, or 896×896 resolution)","text prompt (optional, specifying which regions or semantic categories to segment)"],"output_types":["segmentation mask (binary or multi-class, format depends on fine-tuning approach)","text (semantic descriptions of segmented regions)"],"categories":["image-visual","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"paligemma__cap_4","uri":"capability://image.visual.image.captioning.and.visual.content.description","name":"image captioning and visual content description","description":"Generates natural language descriptions of image content by encoding images through SigLIP's vision transformer to extract comprehensive visual features, then decoding these features through Gemma's language model to produce fluent, contextually appropriate captions. The architecture enables generating captions of varying length and detail level, from short single-sentence descriptions to longer paragraph-length summaries, and can be fine-tuned to match specific caption styles or domains.","intents":["generate alt-text and accessibility descriptions for images in web applications and documents","create metadata and searchable descriptions for image databases and digital asset management systems","build social media content generation tools that automatically caption user-uploaded images","implement video understanding systems that generate frame-by-frame or scene-level descriptions"],"best_for":["web accessibility teams automating alt-text generation for compliance","digital asset management companies enabling searchable image libraries","social media platforms automating content description and discovery","video analysis teams generating temporal descriptions of video content"],"limitations":["Pretrained PT variants require fine-tuning on captioning datasets; mix variants are pre-tuned but may produce generic captions without domain-specific fine-tuning","Caption quality and length depend on fine-tuning data and prompt engineering; no explicit control over caption length or style without additional prompting","Maximum 896×896 input resolution may lose fine details that would improve caption accuracy","No explicit grounding mechanism to indicate which image regions support specific caption phrases","Captions may contain hallucinations or inaccuracies, especially for uncommon objects or scenes"],"requires":["Python 3.8+ runtime","JAX framework for fine-tuning (if using PT variants)","GPU with sufficient VRAM (exact requirements unknown)","Hugging Face or Kaggle model hub access","Optional: caption evaluation metrics (BLEU, CIDEr, METEOR) for quality assessment"],"input_types":["image (JPEG, PNG, WebP at 224×224, 448×448, or 896×896 resolution)","text prompt (optional, to guide caption style, length, or focus)"],"output_types":["text (natural language caption describing image content)"],"categories":["image-visual","text-generation-language"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"paligemma__cap_5","uri":"capability://code.generation.editing.task.specific.fine.tuning.with.jax.framework","name":"task-specific fine-tuning with jax framework","description":"Enables adaptation of pretrained PaliGemma models to specific tasks (OCR, VQA, detection, segmentation, captioning) through supervised fine-tuning using JAX, which provides efficient gradient computation and distributed training across multiple GPUs. The fine-tuning process updates model weights on task-specific datasets, allowing the base architecture to specialize for improved accuracy on target domains while maintaining the hybrid SigLIP+Gemma architecture.","intents":["adapt pretrained models to domain-specific tasks like medical image analysis or specialized OCR","improve model accuracy on target datasets by fine-tuning on labeled examples","create specialized model variants for different use cases without training from scratch","optimize inference latency and accuracy tradeoffs by fine-tuning smaller model variants"],"best_for":["machine learning teams with labeled datasets for specific vision-language tasks","researchers exploring transfer learning and domain adaptation approaches","companies deploying models to specialized domains (medical, legal, manufacturing)","teams optimizing model size/accuracy tradeoffs for resource-constrained deployments"],"limitations":["Requires labeled training data for target task; no unsupervised or self-supervised fine-tuning approaches documented","JAX framework has steeper learning curve than PyTorch or TensorFlow; requires familiarity with functional programming patterns","Fine-tuning hyperparameters (learning rate, batch size, epochs) not documented; requires experimentation","No built-in evaluation metrics or validation frameworks; teams must implement custom evaluation logic","Computational cost of fine-tuning unknown; likely requires GPU resources for reasonable training time","No documentation of fine-tuning convergence behavior or typical accuracy improvements"],"requires":["Python 3.8+ runtime","JAX 0.3.0+ with CUDA/GPU support","Labeled dataset for target task (format and size requirements unknown)","GPU with sufficient VRAM for gradient computation (exact requirements unknown)","Hugging Face or Kaggle model hub access for downloading base weights","Colab notebook environment (provided by Google) or local JAX setup"],"input_types":["image (JPEG, PNG, WebP at 224×224, 448×448, or 896×896 resolution)","text (task-specific labels or annotations)","structured data (task-specific metadata or ground truth)"],"output_types":["model weights (fine-tuned PaliGemma checkpoint)","training metrics (loss curves, validation accuracy)"],"categories":["code-generation-editing","automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"paligemma__cap_6","uri":"capability://image.visual.multi.resolution.image.encoding.with.variable.input.sizes","name":"multi-resolution image encoding with variable input sizes","description":"Processes images at three standardized resolutions (224×224, 448×448, 896×896 pixels) through SigLIP's vision transformer, which extracts visual features at the appropriate scale for the input resolution. This enables flexible input handling where higher resolutions capture finer details at the cost of increased computation, while lower resolutions enable faster inference with reduced memory requirements, allowing developers to optimize for latency or accuracy depending on application requirements.","intents":["optimize inference latency by using lower resolutions for real-time applications","maximize accuracy on detail-sensitive tasks by using higher resolutions","handle variable-sized input images by resizing to supported resolutions","implement adaptive resolution selection based on image content or computational budget"],"best_for":["real-time vision applications requiring sub-second inference latency","detail-sensitive tasks like medical imaging or document OCR requiring high resolution","mobile and edge deployment scenarios with limited computational resources","systems processing heterogeneous image sources with varying native resolutions"],"limitations":["Only three discrete resolutions supported (224×224, 448×448, 896×896); no continuous scaling","Downsampling larger images to supported resolutions loses fine details; upsampling smaller images may introduce artifacts","No documented guidance on resolution selection for specific tasks; requires empirical evaluation","Inference latency and memory requirements for each resolution not documented","No automatic resolution selection based on image content or computational budget"],"requires":["Image preprocessing pipeline to resize inputs to one of three supported resolutions","GPU with sufficient VRAM for target resolution (exact requirements unknown)","Knowledge of task-specific accuracy/latency tradeoffs to select appropriate resolution"],"input_types":["image (JPEG, PNG, WebP at any resolution; must be resized to 224×224, 448×448, or 896×896)"],"output_types":["visual features (encoded by SigLIP, consumed by Gemma decoder)"],"categories":["image-visual","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"paligemma__cap_7","uri":"capability://image.visual.pretrained.model.variants.with.task.specific.tuning","name":"pretrained model variants with task-specific tuning","description":"Provides three model variants optimized for different deployment scenarios: PaliGemma PT (pretrained, requires fine-tuning), PaliGemma FT (research-oriented, task-specific fine-tuning), and PaliGemma mix (multi-task mixture, ready for immediate use). Each variant represents a different point on the spectrum between generality and task-specificity, enabling developers to choose based on whether they have labeled data for fine-tuning or need immediate deployment.","intents":["deploy models immediately without fine-tuning using pre-tuned mix variants","fine-tune models on custom datasets using PT variants as initialization","access research-grade task-specific models for benchmarking and evaluation","choose between immediate deployment and accuracy optimization based on project timeline"],"best_for":["teams with immediate deployment needs who want to use pre-tuned models without fine-tuning","researchers exploring transfer learning and fine-tuning approaches","companies with labeled datasets who can invest in fine-tuning for domain-specific accuracy","projects with flexible timelines that can accommodate fine-tuning development cycles"],"limitations":["PT variants explicitly require fine-tuning before producing useful results; no zero-shot capability documented","Mix variants are pre-tuned but may not match domain-specific accuracy of fine-tuned PT variants","FT variants are research-oriented; no documentation of production readiness or support","No guidance on which variant to choose for specific tasks or domains","No documented accuracy comparisons between variants on standard benchmarks"],"requires":["Hugging Face or Kaggle model hub access to download variant weights","JAX framework for fine-tuning (if using PT variants)","GPU with sufficient VRAM for target variant (3B, 10B, or 28B parameter counts)"],"input_types":["image (JPEG, PNG, WebP at 224×224, 448×448, or 896×896 resolution)","text (task-specific prompts or questions)"],"output_types":["text (task-specific outputs: OCR, VQA answers, object descriptions, segmentation descriptions, captions)"],"categories":["image-visual","text-generation-language"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"paligemma__cap_8","uri":"capability://image.visual.multimodal.input.fusion.with.vision.language.alignment","name":"multimodal input fusion with vision-language alignment","description":"Processes simultaneous image and text inputs by encoding the image through SigLIP to extract visual tokens and concatenating them with text embeddings from Gemma's tokenizer, then feeding the combined sequence to Gemma's decoder. This alignment approach enables the model to understand relationships between visual content and natural language queries, enabling tasks that require reasoning about both modalities simultaneously rather than treating them independently.","intents":["answer questions about images that require understanding both visual content and question semantics","perform image search and retrieval based on natural language queries","generate image descriptions conditioned on specific aspects mentioned in text prompts","implement visual reasoning tasks that require combining visual and linguistic information"],"best_for":["visual search and retrieval systems that understand semantic relationships between images and queries","interactive image exploration tools that answer user questions about visual content","content generation systems that create descriptions conditioned on specific aspects","accessibility tools that provide detailed descriptions of images in response to user queries"],"limitations":["Alignment quality depends on training data; no documentation of alignment approach or training objectives","Text input length limits unknown; may constrain complexity of questions or prompts","No explicit mechanism to weight visual vs. textual information; balance depends on model training","Multimodal hallucinations possible; model may generate plausible-sounding but incorrect descriptions","No documentation of how model handles conflicting or ambiguous visual-linguistic information"],"requires":["Image input at one of three supported resolutions (224×224, 448×448, 896×896)","Text input (natural language question or prompt)","GPU with sufficient VRAM for joint encoding and decoding"],"input_types":["image (JPEG, PNG, WebP at 224×224, 448×448, or 896×896 resolution)","text (natural language question, prompt, or context)"],"output_types":["text (answer, description, or reasoning grounded in both visual and textual inputs)"],"categories":["image-visual","text-generation-language"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"paligemma__cap_9","uri":"capability://tool.use.integration.open.source.model.distribution.via.hugging.face.and.kaggle","name":"open-source model distribution via hugging face and kaggle","description":"Distributes PaliGemma model weights and code through Hugging Face Model Hub and Kaggle Datasets, enabling open-source access without API keys or cloud infrastructure requirements. Developers can download model weights directly, integrate them into custom inference pipelines, and deploy locally or on their own infrastructure, enabling full control over inference, fine-tuning, and deployment without vendor lock-in.","intents":["download and deploy models locally without relying on cloud APIs or vendor infrastructure","integrate models into custom inference pipelines with full control over preprocessing and postprocessing","fine-tune models on private datasets without sending data to external services","build applications with guaranteed model availability and no API rate limits or costs"],"best_for":["teams with privacy requirements who cannot send images to cloud APIs","developers building on-device or edge inference systems","researchers exploring model internals and implementing custom modifications","companies avoiding vendor lock-in and API dependency costs"],"limitations":["Requires local GPU infrastructure for inference; no free cloud inference endpoint provided","Model weights are large (3B, 10B, 28B parameters); downloading and storing locally requires significant disk space","No official API or SDK; developers must implement custom inference code or use community tools","No official support or SLA; community-driven support only","Deployment and scaling responsibility falls on developer; no managed infrastructure"],"requires":["Hugging Face or Kaggle account for model hub access","Python 3.8+ runtime","GPU with sufficient VRAM for target model variant (exact requirements unknown)","Disk space for model weights (likely 6GB+ for 3B variant, 20GB+ for 10B, 56GB+ for 28B)","Custom inference code or community tools (e.g., Ollama, vLLM) for running models"],"input_types":["model weights (downloaded from Hugging Face or Kaggle)","image (JPEG, PNG, WebP at 224×224, 448×448, or 896×896 resolution)","text (task-specific prompts or questions)"],"output_types":["text (task-specific outputs: OCR, VQA answers, object descriptions, segmentation descriptions, captions)"],"categories":["tool-use-integration","automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"paligemma__headline","uri":"capability://image.visual.vision.language.model.for.fine.grained.visual.understanding","name":"vision-language model for fine-grained visual understanding","description":"PaliGemma is a cutting-edge vision-language model that excels in tasks like OCR, visual QA, object detection, and image segmentation, making it ideal for developers seeking advanced visual understanding capabilities.","intents":["best vision-language model","vision-language model for OCR tasks","vision-language model for image segmentation","top models for visual question answering","AI models for object detection"],"best_for":[],"limitations":[],"requires":[],"input_types":["images","text"],"output_types":[],"categories":["image-visual"],"confidence":0.5,"matches":0,"success_rate":0}],"trust":{"score":57,"verified":false,"data_access_risk":"high","permissions":["Python 3.8+ runtime","JAX framework for fine-tuning (if using PT variants)","GPU with sufficient VRAM for 3B/10B/28B model variants (exact requirements unknown)","Access to Hugging Face or Kaggle model hub for downloading weights","GPU with sufficient VRAM (exact requirements unknown but likely 8GB+ for 3B variant)","Hugging Face or Kaggle model hub access for weight download","Google account for Colab access","Internet connection for Colab session","Google Drive access for persistent storage (optional but recommended)","Familiarity with Jupyter notebooks and Python"],"failure_modes":["Pretrained PT variants require fine-tuning on target OCR tasks before producing reliable results; mix variants are pre-tuned but may not match domain-specific accuracy","Maximum input resolution of 896×896 pixels requires downsampling or tiling for larger documents, potentially losing fine details","No built-in handling of multi-page documents; each image must be processed independently","Context window size unknown, limiting ability to process very long text sequences within single images","Pretrained PT variants require fine-tuning on VQA datasets before reliable deployment; mix variants are pre-tuned but may not generalize to specialized domains","Answer quality depends on question clarity and image resolution; ambiguous questions may produce hallucinated or incorrect answers","No explicit grounding mechanism to highlight which image regions support the answer, limiting interpretability","Maximum 896×896 input resolution may lose fine details needed for questions about small objects or distant elements","Colab GPU resources are limited and may be preempted; not suitable for long-running training jobs","Colab session timeout after inactivity; requires checkpointing for long fine-tuning runs","builder identity is not verified yet","no observed match outcomes yet"],"rank_breakdown":{"adoption":0.7,"quality":0.9,"ecosystem":0.3,"match_graph":0.25,"freshness":0.75,"weights":{"adoption":0.35,"quality":0.2,"ecosystem":0.1,"match_graph":0.3,"freshness":0.05}},"observed_outcomes":{"matches":0,"success_rate":0,"avg_confidence":0,"top_intents":[],"last_matched_at":null},"maintenance":{"status":"active","updated_at":"2026-05-24T12:16:25.060Z","last_scraped_at":null,"last_commit":null},"community":{"stars":null,"forks":null,"weekly_downloads":null,"model_downloads":null,"model_likes":null}},"distribution":{"claim_url":"https://unfragile.ai/submit?claim=paligemma","compare_url":"https://unfragile.ai/compare?artifact=paligemma"}},"signature":"YlhHw0N66CIxAparkIIhD4N9pSIxVoHrraKK00YBTOYdzaG0q7Fgm5r/oEVokUjJhZxrRAysn1jdk5KQXniYDQ==","signedAt":"2026-06-22T05:30:35.507Z","signedBy":"unfragile.ai","version":1},"_links":{"self":"https://unfragile.ai/api/v1/passport/paligemma","artifact":"https://unfragile.ai/paligemma","verify":"https://unfragile.ai/api/v1/verify?slug=paligemma","publicKey":"https://unfragile.ai/api/v1/trust-passport-public-key","spec":"https://unfragile.ai/trust","schema":"https://unfragile.ai/schema.json","docs":"https://unfragile.ai/docs"}}