{"passport":{"unfragile":{"@version":"1.0","version":"2026-05","artifact":{"id":"hf-model-facebook--mask2former-swin-large-cityscapes-semantic","slug":"facebook--mask2former-swin-large-cityscapes-semantic","name":"mask2former-swin-large-cityscapes-semantic","type":"model","url":"https://huggingface.co/facebook/mask2former-swin-large-cityscapes-semantic","page_url":"https://unfragile.ai/facebook--mask2former-swin-large-cityscapes-semantic","categories":["image-generation"],"tags":["transformers","pytorch","safetensors","mask2former","vision","image-segmentation","dataset:coco","arxiv:2112.01527","arxiv:2107.06278","license:other","endpoints_compatible","deploy:azure","region:us"],"pricing":{"model":"open_source","free":true,"starting_price":null},"status":"active","verified":false},"capabilities":[{"id":"hf-model-facebook--mask2former-swin-large-cityscapes-semantic__cap_0","uri":"capability://image.visual.panoptic.semantic.segmentation.with.transformer.backbone","name":"panoptic-semantic segmentation with transformer backbone","description":"Performs pixel-level semantic segmentation on images using a Swin Transformer large backbone combined with Mask2Former architecture. The model uses a masked attention mechanism and deformable cross-attention to process multi-scale features, enabling it to classify each pixel into one of 19 Cityscapes semantic classes (road, sidewalk, building, etc.). The architecture processes images through hierarchical vision transformer blocks that capture both local and global context before feeding into the segmentation head.","intents":["segment urban street scenes into semantic categories for autonomous driving perception pipelines","extract road infrastructure (lanes, sidewalks, buildings) from dashcam or satellite imagery","prepare pixel-level annotations for downstream computer vision tasks in urban environments","benchmark semantic segmentation performance on Cityscapes-domain images"],"best_for":["autonomous vehicle teams building perception stacks for urban environments","computer vision researchers evaluating state-of-the-art segmentation architectures","teams deploying edge inference on Cityscapes-domain street-level imagery"],"limitations":["Model trained exclusively on Cityscapes dataset (European urban streets) — performance degrades significantly on non-urban or geographically different scenes","Requires GPU memory ~11GB for inference on full-resolution images due to Swin-Large backbone size","Inference latency ~200-400ms per image on V100 GPU — not suitable for real-time 30+ FPS applications without optimization","Only supports 19 semantic classes from Cityscapes taxonomy — cannot be directly applied to other domain-specific segmentation tasks without fine-tuning","No built-in batch processing optimization — sequential inference required for multiple images"],"requires":["PyTorch 1.9+","transformers library 4.25+","CUDA 11.0+ for GPU inference (CPU inference possible but ~10x slower)","Minimum 12GB VRAM for batch size 1 at 1024x2048 resolution","PIL/Pillow for image loading and preprocessing"],"input_types":["RGB images (3-channel, uint8 or float32)","Variable resolution (model handles dynamic input sizes via padding/resizing)","Formats: JPEG, PNG, BMP via PIL"],"output_types":["segmentation maps (HxW integer tensor with class indices 0-18)","per-pixel class logits (HxWx19 float tensor before argmax)","optional: class probability maps via softmax"],"categories":["image-visual","semantic-segmentation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-facebook--mask2former-swin-large-cityscapes-semantic__cap_1","uri":"capability://image.visual.multi.scale.feature.extraction.via.hierarchical.vision.transformer","name":"multi-scale feature extraction via hierarchical vision transformer","description":"Extracts hierarchical feature pyramids from input images using Swin Transformer's shifted-window attention blocks across 4 stages (C2, C3, C4, C5 in ResNet nomenclature). Each stage progressively reduces spatial resolution while increasing channel depth, with shifted-window attention enabling linear complexity scaling. Features are then fused via lateral connections and upsampling before feeding into the segmentation decoder, allowing the model to capture both fine-grained details and semantic context.","intents":["extract multi-resolution feature representations for downstream segmentation heads","enable efficient processing of high-resolution images through hierarchical downsampling","capture both local texture and global semantic context in a single forward pass"],"best_for":["researchers studying vision transformer efficiency vs CNN-based feature extraction","teams optimizing inference latency through feature-level pruning or quantization"],"limitations":["Shifted-window attention requires image dimensions divisible by 32 — smaller images may need padding that affects edge predictions","Feature extraction is not independently accessible — requires loading full model weights even if only intermediate features are needed","Memory consumption scales quadratically with image resolution due to attention computation in later stages"],"requires":["PyTorch 1.9+","transformers library with Swin implementation"],"input_types":["RGB images (3-channel)"],"output_types":["4-level feature pyramid (C2-C5 with channels 96, 192, 384, 768)"],"categories":["image-visual","feature-extraction"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-facebook--mask2former-swin-large-cityscapes-semantic__cap_10","uri":"capability://image.visual.fine.tuning.on.custom.semantic.segmentation.datasets","name":"fine-tuning on custom semantic segmentation datasets","description":"Supports transfer learning by fine-tuning the pre-trained Cityscapes model on custom semantic segmentation datasets. The model's backbone and decoder weights are initialized from Cityscapes pre-training, and only the final classification layer is retrained for custom class taxonomies. Fine-tuning requires annotated images with per-pixel class labels in the same format as Cityscapes (PNG masks with class indices). Training uses standard PyTorch optimizers (AdamW) and learning rate schedules (cosine annealing).","intents":["adapt model to custom domains (e.g., different cities, weather conditions, camera types) with limited labeled data","train on custom class taxonomies (e.g., 5 classes instead of 19) without retraining from scratch","improve model performance on domain-specific data through transfer learning"],"best_for":["teams with limited labeled data for custom segmentation tasks","practitioners adapting Cityscapes model to different domains","researchers studying transfer learning in semantic segmentation"],"limitations":["Fine-tuning requires pixel-level annotations — significantly more expensive than image-level labels","Transfer learning effectiveness depends on domain similarity to Cityscapes — very different domains may require more training data","Fine-tuning on small datasets (<1000 images) may lead to overfitting despite pre-training","No built-in domain adaptation techniques (e.g., style transfer, self-supervised learning) — requires manual implementation","Fine-tuning requires GPU and deep learning expertise — not accessible to non-technical users"],"requires":["PyTorch 1.9+","transformers library 4.25+","Annotated dataset with per-pixel class labels (PNG masks)","GPU with 12GB+ VRAM","Training code (not provided in model card — requires custom implementation or use of third-party libraries)"],"input_types":["RGB images and corresponding PNG segmentation masks"],"output_types":["fine-tuned model checkpoint"],"categories":["image-visual","transfer-learning"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-facebook--mask2former-swin-large-cityscapes-semantic__cap_11","uri":"capability://image.visual.deployment.on.cloud.platforms.with.huggingface.inference.api","name":"deployment on cloud platforms with huggingface inference api","description":"Model is compatible with HuggingFace's managed Inference API, enabling serverless deployment without infrastructure management. Users can call the model via REST API endpoints hosted on HuggingFace servers, with automatic scaling and GPU allocation. The API handles model loading, inference, and response formatting, returning segmentation maps as base64-encoded images or JSON arrays.","intents":["deploy model without managing servers or GPUs","integrate segmentation into web applications via REST API","scale inference automatically based on demand"],"best_for":["teams without infrastructure expertise","web applications requiring on-demand segmentation","prototypes and MVPs prioritizing time-to-market over cost"],"limitations":["API latency is 500ms-2s per image due to network overhead and cold start — unsuitable for real-time applications","Pricing scales with API calls — high-volume applications may be expensive vs self-hosted inference","Limited customization — cannot modify preprocessing or postprocessing logic","API rate limits may throttle high-throughput applications","Data privacy concerns — images are sent to HuggingFace servers"],"requires":["HuggingFace account","API key for authentication","HTTP client library (requests, curl, etc.)"],"input_types":["image files (JPEG, PNG) or base64-encoded image data"],"output_types":["segmentation map as base64-encoded image or JSON array"],"categories":["image-visual","cloud-deployment"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-facebook--mask2former-swin-large-cityscapes-semantic__cap_12","uri":"capability://image.visual.model.quantization.for.edge.deployment","name":"model quantization for edge deployment","description":"Supports post-training quantization to int8 precision using PyTorch's quantization APIs, reducing model size from ~500MB to ~125MB and enabling deployment on edge devices with limited storage. Quantization converts float32 weights and activations to int8, reducing memory bandwidth and enabling faster inference on specialized hardware (e.g., Qualcomm Snapdragon). Quantization-aware training is not performed, so accuracy may degrade by 1-2% on minority classes.","intents":["deploy model on edge devices with limited storage (e.g., mobile phones, embedded systems)","reduce model size for faster download and deployment","enable inference on specialized hardware with int8 support"],"best_for":["mobile and edge deployment on iOS, Android, embedded Linux","teams prioritizing model size and inference speed over accuracy","resource-constrained environments with limited storage and memory"],"limitations":["Quantization causes 1-2% mIoU accuracy loss on minority classes due to reduced precision","Quantization requires calibration on representative data — poor calibration data leads to larger accuracy loss","Quantized models are not easily fine-tunable — require re-quantization after retraining","Quantization may not be supported for all operations (e.g., deformable attention) — requires custom quantization logic","Quantized inference requires specialized hardware or software support — not all inference engines support int8"],"requires":["PyTorch 1.9+","Calibration dataset (representative images for quantization calibration)","Inference engine with int8 support (TensorRT, ONNX Runtime, TFLite)"],"input_types":["float32 model checkpoint"],"output_types":["int8 quantized model (~125MB)"],"categories":["image-visual","model-optimization"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-facebook--mask2former-swin-large-cityscapes-semantic__cap_2","uri":"capability://image.visual.masked.attention.based.segmentation.head.with.deformable.cross.attention","name":"masked attention-based segmentation head with deformable cross-attention","description":"Decodes multi-scale features into per-pixel class predictions using Mask2Former's masked attention mechanism, which operates on a learned set of class queries (19 for Cityscapes). The decoder uses deformable cross-attention to dynamically focus on relevant spatial regions rather than attending uniformly across the feature map, reducing computational cost and improving localization. Queries are iteratively refined through multiple decoder layers, with each layer predicting both class logits and binary masks that gate attention in subsequent layers.","intents":["convert multi-scale feature pyramids into pixel-level semantic predictions","focus model attention on relevant image regions to improve segmentation accuracy","enable efficient decoding through learned query-based attention rather than dense convolutions"],"best_for":["teams deploying segmentation models where inference latency is critical","researchers studying query-based vs dense prediction paradigms in vision"],"limitations":["Deformable attention adds ~50-100ms latency per image compared to standard convolution-based decoders","Query-based approach may struggle with very small objects (<1% image area) due to limited query capacity","Requires careful initialization of class queries — poor initialization can lead to mode collapse where multiple queries predict the same class"],"requires":["PyTorch 1.9+","CUDA-compatible GPU for deformable attention CUDA kernels"],"input_types":["multi-scale feature pyramids (4 levels from backbone)"],"output_types":["class logits per pixel (HxWx19)","binary segmentation masks per query (19xHxW)"],"categories":["image-visual","segmentation-decoding"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-facebook--mask2former-swin-large-cityscapes-semantic__cap_3","uri":"capability://image.visual.cityscapes.domain.semantic.class.prediction.with.19.class.taxonomy","name":"cityscapes-domain semantic class prediction with 19-class taxonomy","description":"Predicts one of 19 semantic classes for each pixel, including road, sidewalk, building, wall, fence, pole, traffic light, traffic sign, vegetation, terrain, sky, person, rider, car, truck, bus, train, motorcycle, and bicycle. The model outputs per-pixel class logits that are converted to class indices via argmax. Class distribution is heavily imbalanced (road/building dominate), which the training process addresses through weighted cross-entropy loss, but this imbalance persists in inference predictions.","intents":["classify each pixel in urban street scenes into semantic categories for scene understanding","generate pixel-level ground truth for training downstream perception models","evaluate autonomous driving perception systems on standard Cityscapes benchmark"],"best_for":["autonomous driving teams working with Cityscapes-annotated datasets","researchers benchmarking on standard urban scene understanding tasks"],"limitations":["Only supports 19 Cityscapes classes — cannot predict custom classes without retraining","Severe class imbalance in predictions (road/building predictions dominate) — requires post-processing for balanced class representation","Performance drops significantly on non-Cityscapes domains (e.g., different cities, weather conditions, camera angles)","No uncertainty quantification — model outputs hard class predictions without confidence scores"],"requires":["Input images from Cityscapes-like urban street scenes for optimal performance","Post-processing to map 19 classes to application-specific taxonomy if needed"],"input_types":["RGB images of urban street scenes"],"output_types":["integer tensor (HxW) with values 0-18 representing class indices","optional: float tensor (HxWx19) with per-class logits before argmax"],"categories":["image-visual","classification"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-facebook--mask2former-swin-large-cityscapes-semantic__cap_4","uri":"capability://image.visual.variable.resolution.image.processing.with.dynamic.padding","name":"variable-resolution image processing with dynamic padding","description":"Accepts images of arbitrary resolution and automatically pads them to multiples of 32 (required by Swin Transformer's shifted-window attention) before processing. The model internally resizes or pads input images to a standard size (typically 1024x2048 for Cityscapes resolution) while preserving aspect ratio through letterboxing. Output segmentation maps are then cropped back to original input dimensions, enabling inference on images of any size without retraining.","intents":["process images from different cameras or sources with varying resolutions without preprocessing","handle both high-resolution (4K) and low-resolution (VGA) inputs in the same pipeline","avoid retraining or fine-tuning for different input resolutions"],"best_for":["production systems handling heterogeneous image sources","teams deploying on edge devices with variable input resolutions"],"limitations":["Padding adds computational overhead (~5-10% latency increase) and memory consumption","Segmentation quality degrades for images significantly smaller than training resolution (1024x2048) due to information loss during downsampling","Very large images (>4K) may exceed GPU memory — requires tiling or resolution reduction","Aspect ratio preservation via letterboxing creates black padding regions that may be misclassified as background"],"requires":["Input images with aspect ratio between 0.5 and 2.0 (extreme aspect ratios may require custom preprocessing)"],"input_types":["RGB images of any resolution"],"output_types":["segmentation maps matching input image dimensions"],"categories":["image-visual","preprocessing"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-facebook--mask2former-swin-large-cityscapes-semantic__cap_5","uri":"capability://image.visual.batch.inference.with.configurable.batch.size","name":"batch inference with configurable batch size","description":"Supports processing multiple images in a single forward pass by stacking them into batches, reducing per-image overhead and improving GPU utilization. Batch size is configurable based on available GPU memory (typical range: 1-8 for V100 at 1024x2048 resolution). The model processes all images in parallel through the transformer backbone and decoder, with output segmentation maps returned as a batch tensor.","intents":["process multiple images efficiently in production pipelines","maximize GPU utilization for throughput-critical applications","reduce per-image latency through amortized overhead"],"best_for":["batch processing pipelines (e.g., processing video frames offline)","teams optimizing inference throughput on fixed hardware"],"limitations":["Batch processing requires all images to be padded to same dimensions — heterogeneous resolutions require padding to largest image size, wasting computation","Memory consumption scales linearly with batch size — large batches may exceed GPU memory","No built-in batching utilities in HuggingFace transformers library — requires manual tensor stacking and unstacking","Batch size must be fixed at inference time — dynamic batching not supported"],"requires":["GPU with sufficient memory for batch size × image resolution","Manual tensor manipulation for batching/unbatching"],"input_types":["batch of RGB images (BxCxHxW tensor)"],"output_types":["batch of segmentation maps (BxHxW integer tensor)"],"categories":["image-visual","batch-processing"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-facebook--mask2former-swin-large-cityscapes-semantic__cap_6","uri":"capability://image.visual.model.export.to.onnx.and.torchscript.formats","name":"model export to onnx and torchscript formats","description":"Exports the trained model to ONNX (Open Neural Network Exchange) and TorchScript formats for deployment in non-PyTorch environments (e.g., C++, mobile, ONNX Runtime). The export process traces or scripts the model's forward pass, converting PyTorch operations to framework-agnostic representations. ONNX export enables deployment on CPUs, mobile devices, and specialized inference engines (TensorRT, CoreML), while TorchScript enables C++ deployment without Python dependency.","intents":["deploy model in production environments without PyTorch dependency","run inference on mobile devices or edge hardware with ONNX Runtime","integrate model into C++ applications for autonomous driving systems"],"best_for":["teams deploying to production servers without PyTorch","mobile/edge deployment on iOS, Android, or embedded Linux","C++ integration for real-time autonomous driving systems"],"limitations":["ONNX export may lose some PyTorch-specific operations — deformable attention kernels may not export cleanly, requiring custom ONNX operators","Exported models are not easily fine-tunable — require re-export after retraining","ONNX Runtime inference may be 10-20% slower than PyTorch due to operator fusion differences","TorchScript export requires tracing, which may fail for models with dynamic control flow","No built-in quantization during export — requires separate quantization step for mobile deployment"],"requires":["PyTorch 1.9+","ONNX opset 13+ for deformable attention support","ONNX Runtime or TensorRT for inference"],"input_types":["PyTorch model checkpoint"],"output_types":["ONNX model file (.onnx)","TorchScript model file (.pt)"],"categories":["image-visual","model-export"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-facebook--mask2former-swin-large-cityscapes-semantic__cap_7","uri":"capability://image.visual.inference.on.cpu.with.reduced.precision","name":"inference on cpu with reduced precision","description":"Supports inference on CPU hardware using reduced precision (float16, int8) through PyTorch's quantization and mixed-precision APIs. CPU inference is ~10-20x slower than GPU but enables deployment on servers without NVIDIA GPUs. Mixed-precision inference (float16 on GPU, float32 on CPU) reduces memory consumption by ~50% at cost of slight accuracy degradation (<0.5% mIoU loss).","intents":["deploy model on CPU-only servers for cost reduction","run inference on laptops or edge devices without GPU","reduce memory consumption for deployment on memory-constrained hardware"],"best_for":["cost-sensitive deployments where GPU is unavailable","edge devices with limited memory (e.g., Raspberry Pi, Jetson Nano)","teams prioritizing inference cost over latency"],"limitations":["CPU inference is 10-20x slower than GPU (2-4 seconds per image vs 200-400ms) — unsuitable for real-time applications","Quantization to int8 may cause 1-2% mIoU accuracy loss on minority classes","Mixed-precision inference requires careful handling of numerical stability — some operations may overflow in float16","No built-in quantization-aware training — requires post-training quantization which may be suboptimal"],"requires":["PyTorch 1.9+","CPU with AVX2 support for optimized operations","Optional: Intel MKL for optimized BLAS operations"],"input_types":["RGB images"],"output_types":["segmentation maps (may have slight accuracy degradation vs float32)"],"categories":["image-visual","inference-optimization"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-facebook--mask2former-swin-large-cityscapes-semantic__cap_8","uri":"capability://image.visual.integration.with.huggingface.transformers.pipeline.api","name":"integration with huggingface transformers pipeline api","description":"Integrates with HuggingFace's high-level pipeline API, enabling one-line inference without manual model loading or preprocessing. The pipeline handles image loading, resizing, normalization, and output post-processing automatically. Users can instantiate a segmentation pipeline with a single function call and process images with `.predict()` method, abstracting away PyTorch complexity.","intents":["quickly prototype segmentation applications without deep PyTorch knowledge","integrate model into existing HuggingFace-based workflows","reduce boilerplate code for inference"],"best_for":["non-expert users prototyping segmentation applications","teams already using HuggingFace transformers ecosystem","rapid prototyping and proof-of-concept development"],"limitations":["Pipeline API abstracts away control over preprocessing — difficult to customize normalization or resizing behavior","No direct access to intermediate features or logits — only final class predictions available","Pipeline API adds ~50-100ms overhead per image due to abstraction layers","Batch processing through pipeline API is less efficient than direct model calls","Limited debugging visibility — errors in preprocessing are hidden behind pipeline abstraction"],"requires":["transformers library 4.25+","PyTorch 1.9+"],"input_types":["image file paths, PIL images, or numpy arrays"],"output_types":["segmentation map as PIL image or numpy array"],"categories":["image-visual","api-integration"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-facebook--mask2former-swin-large-cityscapes-semantic__cap_9","uri":"capability://image.visual.model.card.documentation.with.benchmark.metrics","name":"model card documentation with benchmark metrics","description":"Provides comprehensive model documentation including training dataset details, benchmark metrics on Cityscapes validation set (82.0 mIoU), per-class IoU scores, inference latency benchmarks on different hardware (V100, A100, CPU), and usage examples. Documentation includes limitations, ethical considerations, and recommendations for fine-tuning on custom datasets.","intents":["evaluate model suitability for specific applications based on published metrics","understand model limitations and failure modes before deployment","estimate inference latency and hardware requirements for production planning"],"best_for":["teams evaluating models for production deployment","researchers comparing model performance across benchmarks","practitioners understanding model capabilities and limitations"],"limitations":["Benchmark metrics are from Cityscapes validation set — may not reflect performance on other domains","Latency benchmarks are from specific hardware configurations — actual latency varies with hardware, batch size, and image resolution","No per-class performance breakdown in model card — requires manual evaluation to understand class-specific accuracy","Documentation may not cover all edge cases or failure modes discovered in production"],"requires":["Access to HuggingFace model card (online)"],"input_types":["none (documentation only)"],"output_types":["text documentation, benchmark tables, usage examples"],"categories":["image-visual","documentation"],"confidence":0.5,"matches":0,"success_rate":0}],"trust":{"score":46,"verified":false,"data_access_risk":"high","permissions":["PyTorch 1.9+","transformers library 4.25+","CUDA 11.0+ for GPU inference (CPU inference possible but ~10x slower)","Minimum 12GB VRAM for batch size 1 at 1024x2048 resolution","PIL/Pillow for image loading and preprocessing","transformers library with Swin implementation","Annotated dataset with per-pixel class labels (PNG masks)","GPU with 12GB+ VRAM","Training code (not provided in model card — requires custom implementation or use of third-party libraries)","HuggingFace account"],"failure_modes":["Model trained exclusively on Cityscapes dataset (European urban streets) — performance degrades significantly on non-urban or geographically different scenes","Requires GPU memory ~11GB for inference on full-resolution images due to Swin-Large backbone size","Inference latency ~200-400ms per image on V100 GPU — not suitable for real-time 30+ FPS applications without optimization","Only supports 19 semantic classes from Cityscapes taxonomy — cannot be directly applied to other domain-specific segmentation tasks without fine-tuning","No built-in batch processing optimization — sequential inference required for multiple images","Shifted-window attention requires image dimensions divisible by 32 — smaller images may need padding that affects edge predictions","Feature extraction is not independently accessible — requires loading full model weights even if only intermediate features are needed","Memory consumption scales quadratically with image resolution due to attention computation in later stages","Fine-tuning requires pixel-level annotations — significantly more expensive than image-level labels","Transfer learning effectiveness depends on domain similarity to Cityscapes — very different domains may require more training data","builder identity is not verified yet","no observed match outcomes yet"],"rank_breakdown":{"adoption":0.5543187472577048,"quality":0.5,"ecosystem":0.5000000000000001,"match_graph":0.25,"freshness":0.75,"weights":{"adoption":0.35,"quality":0.2,"ecosystem":0.1,"match_graph":0.3,"freshness":0.05}},"observed_outcomes":{"matches":0,"success_rate":0,"avg_confidence":0,"top_intents":[],"last_matched_at":null},"maintenance":{"status":"active","updated_at":"2026-05-24T12:16:22.765Z","last_scraped_at":"2026-05-03T14:23:00.162Z","last_commit":null},"community":{"stars":null,"forks":null,"weekly_downloads":null,"model_downloads":155904,"model_likes":37}},"distribution":{"claim_url":"https://unfragile.ai/submit?claim=facebook--mask2former-swin-large-cityscapes-semantic","compare_url":"https://unfragile.ai/compare?artifact=facebook--mask2former-swin-large-cityscapes-semantic"}},"signature":"BAB3g+YaaddxxfbC4yHr2odAlhy/iaXFgANLj9otS3jmdx3RVhvfp/NnqqQ3E79GkFJFCFdFoWkSgzwGyXemAw==","signedAt":"2026-06-20T15:07:39.935Z","signedBy":"unfragile.ai","version":1},"_links":{"self":"https://unfragile.ai/api/v1/passport/facebook--mask2former-swin-large-cityscapes-semantic","artifact":"https://unfragile.ai/facebook--mask2former-swin-large-cityscapes-semantic","verify":"https://unfragile.ai/api/v1/verify?slug=facebook--mask2former-swin-large-cityscapes-semantic","publicKey":"https://unfragile.ai/api/v1/trust-passport-public-key","spec":"https://unfragile.ai/trust","schema":"https://unfragile.ai/schema.json","docs":"https://unfragile.ai/docs"}}