{"passport":{"unfragile":{"@version":"1.0","version":"2026-05","artifact":{"id":"hf-model-distil-whisper--distil-large-v3","slug":"distil-whisper--distil-large-v3","name":"distil-large-v3","type":"model","url":"https://huggingface.co/distil-whisper/distil-large-v3","page_url":"https://unfragile.ai/distil-whisper--distil-large-v3","categories":["voice-audio"],"tags":["transformers","jax","tensorboard","onnx","safetensors","whisper","automatic-speech-recognition","audio","transformers.js","en","arxiv:2311.00430","arxiv:2210.13352","license:mit","eval-results","endpoints_compatible","region:us"],"pricing":{"model":"open_source","free":true,"starting_price":null},"status":"active","verified":false},"capabilities":[{"id":"hf-model-distil-whisper--distil-large-v3__cap_0","uri":"capability://data.processing.analysis.multilingual.speech.to.text.transcription","name":"multilingual-speech-to-text-transcription","description":"Converts audio streams into text across 99 languages using a distilled Whisper encoder-decoder architecture that reduces the original Whisper model by ~49% while maintaining accuracy. The model uses cross-attention between audio mel-spectrogram features and learned token embeddings, processing variable-length audio through a convolutional feature extractor followed by transformer layers. Distillation was applied via knowledge transfer from the full Whisper large model, enabling efficient inference on CPU and edge devices.","intents":["I need to transcribe audio files in multiple languages with minimal latency on resource-constrained devices","I want to build a real-time speech recognition pipeline that doesn't require GPU acceleration","I need to process large volumes of audio data cost-effectively without cloud API calls","I'm building a multilingual voice assistant that needs to run locally for privacy"],"best_for":["developers building privacy-first voice applications","teams deploying speech recognition to edge devices or mobile","organizations processing multilingual audio at scale with cost constraints","researchers implementing speech-to-text in low-resource environments"],"limitations":["Distillation reduces model capacity — accuracy on specialized domains (medical, technical jargon) may degrade vs full Whisper large","No built-in speaker diarization or speaker identification — outputs single continuous transcript","Requires audio preprocessing (resampling to 16kHz mono) — raw audio formats need conversion","Inference speed varies significantly by hardware — CPU inference on long audio (>30min) may exceed real-time factor of 1x","No streaming/chunked inference support in base model — requires full audio buffering before transcription"],"requires":["Python 3.8+","transformers library (>=4.30.0)","librosa or similar audio processing library for preprocessing","Audio input at 16kHz sample rate (mono or stereo)","~3GB disk space for model weights (safetensors format)","PyTorch or JAX runtime (model supports both via transformers)"],"input_types":["audio/wav","audio/mp3","audio/flac","audio/ogg","raw PCM samples (numpy array)","audio file paths"],"output_types":["text (transcription)","structured JSON with token-level timing information","language detection confidence scores"],"categories":["data-processing-analysis","speech-recognition"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-distil-whisper--distil-large-v3__cap_1","uri":"capability://data.processing.analysis.language.identification.from.audio","name":"language-identification-from-audio","description":"Automatically detects the spoken language in audio input by analyzing the acoustic features through the encoder portion of the distilled Whisper model, which learns language-specific phonetic patterns during training. The model outputs language probabilities across 99 supported languages, allowing downstream systems to route transcription or handle multilingual content appropriately. Language detection occurs as a byproduct of the transcription process without additional inference passes.","intents":["I need to automatically route audio to the correct language-specific transcription pipeline","I want to detect the primary language in mixed-language audio for content classification","I'm building a system that needs to handle user-submitted audio in unknown languages","I need language detection confidence scores to decide whether to apply language-specific post-processing"],"best_for":["multilingual content platforms requiring automatic language routing","speech analytics systems processing diverse user-generated audio","voice applications needing to adapt behavior based on detected language","data preprocessing pipelines for multilingual training datasets"],"limitations":["Language detection accuracy depends on audio duration — clips <3 seconds may have high false positive rates","Cannot distinguish between similar language variants (e.g., Mandarin vs Cantonese) — treats them as single language class","Confidence scores are not calibrated probabilities — relative ranking is reliable but absolute values should not be thresholded without validation","Requires sufficient phonetic content — silence, music, or non-speech audio may produce unreliable language predictions"],"requires":["Python 3.8+","transformers library (>=4.30.0)","Audio input at 16kHz sample rate","Minimum audio duration of ~2-3 seconds for reliable detection"],"input_types":["audio/wav","audio/mp3","audio/flac","raw PCM samples (numpy array)"],"output_types":["language code (ISO 639-1 or custom format)","confidence scores across 99 language classes","top-k language predictions with probabilities"],"categories":["data-processing-analysis","language-detection"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-distil-whisper--distil-large-v3__cap_2","uri":"capability://automation.workflow.cpu.optimized.inference.with.quantization.support","name":"cpu-optimized-inference-with-quantization-support","description":"Enables efficient inference on CPU and edge devices through support for multiple model formats (PyTorch, JAX, ONNX) and quantization strategies. The model can be loaded in float32, float16, or quantized int8 formats depending on hardware constraints, with ONNX export enabling runtime optimization via ONNX Runtime's graph optimization and operator fusion. The distilled architecture (49% smaller than Whisper large) combined with quantization can reduce memory footprint to <1GB, enabling deployment on devices with limited RAM.","intents":["I need to run speech recognition on a laptop or mobile device without GPU","I want to minimize model size and memory usage for embedded or IoT deployment","I'm deploying to a server with CPU-only constraints and need to maximize throughput","I need to quantize the model to int8 for faster inference on edge hardware"],"best_for":["edge device developers (Raspberry Pi, Jetson Nano, mobile phones)","on-premise deployment teams avoiding cloud costs","privacy-focused applications requiring local-only inference","resource-constrained environments (embedded systems, IoT)"],"limitations":["Quantization to int8 introduces 1-3% accuracy degradation on average, with larger gaps on low-resource languages","ONNX export requires manual conversion — not all model variants are pre-exported","CPU inference speed is highly hardware-dependent — ARM processors (mobile) are 5-10x slower than x86 CPUs","No built-in batching optimization for CPU — processing multiple audio files sequentially is inefficient","JAX backend requires additional dependencies and is less mature than PyTorch for this model"],"requires":["Python 3.8+","transformers library (>=4.30.0)","PyTorch (>=1.9.0) OR JAX (>=0.3.0) OR ONNX Runtime (>=1.14.0)","For quantization: onnx and onnxruntime libraries","Minimum 2GB RAM for float32 inference, 1GB for quantized int8","CPU with at least 2 cores for reasonable inference speed"],"input_types":["audio/wav","audio/mp3","audio/flac","raw PCM samples (numpy array)"],"output_types":["text (transcription)","structured JSON with timing information"],"categories":["automation-workflow","model-optimization"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-distil-whisper--distil-large-v3__cap_3","uri":"capability://data.processing.analysis.batch.audio.processing.with.variable.length.handling","name":"batch-audio-processing-with-variable-length-handling","description":"Processes multiple audio files of varying lengths in a single inference pass by padding shorter sequences and masking padded positions in the attention mechanism. The model's convolutional feature extractor handles variable-length mel-spectrograms, and the transformer encoder uses attention masks to prevent the model from attending to padding tokens. Batch processing reduces per-sample overhead and enables efficient GPU/CPU utilization when processing datasets.","intents":["I need to transcribe a folder of audio files with different durations efficiently","I want to maximize GPU/CPU utilization by processing multiple audio files in parallel","I'm building a batch processing pipeline for large audio datasets","I need to handle audio files ranging from 5 seconds to 30 minutes in a single batch"],"best_for":["data processing teams handling large audio corpora","batch transcription services processing overnight jobs","researchers preparing multilingual speech datasets","content platforms transcribing user-uploaded audio in bulk"],"limitations":["Batch size is limited by available memory — very long audio (>30min) may require batch_size=1","Padding overhead increases with length variance — batching 5-second clips with 30-minute files wastes computation","No built-in progress tracking or checkpointing — long batches may fail without recovery mechanism","Output ordering must be manually tracked — batch processing doesn't preserve input file metadata","Memory usage scales linearly with batch size and max sequence length in batch"],"requires":["Python 3.8+","transformers library (>=4.30.0)","PyTorch or JAX with batch processing support","Sufficient RAM: ~500MB per audio hour in batch + overhead","Audio preprocessing pipeline to normalize sample rates and formats"],"input_types":["list of audio file paths","list of numpy arrays (PCM samples)","list of mel-spectrogram tensors"],"output_types":["list of transcriptions (text)","structured JSON with per-sample timing and language info","batch processing metrics (throughput, latency)"],"categories":["data-processing-analysis","batch-processing"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-distil-whisper--distil-large-v3__cap_4","uri":"capability://tool.use.integration.onnx.export.and.cross.platform.inference","name":"onnx-export-and-cross-platform-inference","description":"Exports the distilled Whisper model to ONNX (Open Neural Network Exchange) format, enabling inference across diverse platforms (Windows, Linux, macOS, mobile, web browsers) using ONNX Runtime. The export process converts PyTorch operations to ONNX opset 14+, preserving the encoder-decoder architecture and attention mechanisms. ONNX Runtime applies graph-level optimizations (operator fusion, constant folding) and supports hardware-specific execution providers (CPU, GPU, CoreML for iOS, NNAPI for Android).","intents":["I need to deploy the model to multiple platforms (web, mobile, desktop) from a single export","I want to use ONNX Runtime optimizations to speed up inference on specific hardware","I'm building a cross-platform application and need a unified model format","I need to run the model in a web browser using ONNX.js or similar runtime"],"best_for":["cross-platform application developers (Electron, React Native, Flutter)","web application teams deploying ML models to browsers","mobile app developers targeting iOS and Android","DevOps teams managing model deployment across heterogeneous infrastructure"],"limitations":["ONNX export requires manual conversion — not all model variants are pre-exported to ONNX","ONNX Runtime performance varies by execution provider — CPU inference may be slower than optimized PyTorch on some hardware","Web browser inference (ONNX.js) is significantly slower than native runtimes due to JavaScript overhead","Mobile ONNX Runtime support is less mature than PyTorch Mobile — some operators may not be supported","ONNX model files are typically larger than PyTorch checkpoints due to format overhead"],"requires":["Python 3.8+ for export","PyTorch (>=1.9.0) for export","onnx and onnxruntime libraries","ONNX Runtime (>=1.14.0) for inference","Platform-specific runtime: ONNX Runtime CPU, GPU, or mobile variants","For web: ONNX.js or similar browser-compatible runtime"],"input_types":["audio/wav","audio/mp3","audio/flac","raw PCM samples (numpy array, JavaScript typed arrays)"],"output_types":["text (transcription)","structured JSON with timing information","platform-specific output formats (JavaScript objects, native data structures)"],"categories":["tool-use-integration","model-deployment"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-distil-whisper--distil-large-v3__cap_5","uri":"capability://data.processing.analysis.token.level.timing.and.alignment.extraction","name":"token-level-timing-and-alignment-extraction","description":"Extracts precise timing information for each generated token (word or subword) by tracking the decoder's output positions and mapping them back to input audio timestamps. The model outputs token-level alignments through the decoder's attention weights over the encoder output, enabling applications to determine exactly when each word was spoken. This is achieved by preserving the encoder-decoder attention patterns during inference and post-processing them to align tokens with audio frames.","intents":["I need to generate subtitle files with precise word-level timing (SRT, VTT formats)","I want to highlight spoken words in real-time as audio plays","I'm building a speech analytics system that needs to correlate events with specific words","I need to create searchable transcripts where users can click to jump to specific words"],"best_for":["video/media production teams creating subtitles","accessibility teams building caption systems","speech analytics and call center platforms","interactive media applications with word-level synchronization"],"limitations":["Token-level timing accuracy degrades on noisy audio or accented speech — alignment errors can be 100-500ms","Timing information is approximate, not frame-accurate — suitable for subtitles but not for precise audio-visual sync","Requires post-processing to convert token alignments to word-level timing — raw output is subword-level (BPE tokens)","Attention-based alignment can be ambiguous for repeated words or long pauses — may require heuristic disambiguation","No built-in support for speaker-specific timing — all words are aligned to a single timeline"],"requires":["Python 3.8+","transformers library (>=4.30.0) with attention output enabled","Post-processing library for token-to-word alignment (custom or third-party)","Audio input at 16kHz sample rate"],"input_types":["audio/wav","audio/mp3","audio/flac","raw PCM samples (numpy array)"],"output_types":["JSON with token-level timing: [{token, start_time, end_time}, ...]","SRT/VTT subtitle format","structured alignment data for downstream processing"],"categories":["data-processing-analysis","alignment-extraction"],"confidence":0.5,"matches":0,"success_rate":0}],"trust":{"score":50,"verified":false,"data_access_risk":"low","permissions":["Python 3.8+","transformers library (>=4.30.0)","librosa or similar audio processing library for preprocessing","Audio input at 16kHz sample rate (mono or stereo)","~3GB disk space for model weights (safetensors format)","PyTorch or JAX runtime (model supports both via transformers)","Audio input at 16kHz sample rate","Minimum audio duration of ~2-3 seconds for reliable detection","PyTorch (>=1.9.0) OR JAX (>=0.3.0) OR ONNX Runtime (>=1.14.0)","For quantization: onnx and onnxruntime libraries"],"failure_modes":["Distillation reduces model capacity — accuracy on specialized domains (medical, technical jargon) may degrade vs full Whisper large","No built-in speaker diarization or speaker identification — outputs single continuous transcript","Requires audio preprocessing (resampling to 16kHz mono) — raw audio formats need conversion","Inference speed varies significantly by hardware — CPU inference on long audio (>30min) may exceed real-time factor of 1x","No streaming/chunked inference support in base model — requires full audio buffering before transcription","Language detection accuracy depends on audio duration — clips <3 seconds may have high false positive rates","Cannot distinguish between similar language variants (e.g., Mandarin vs Cantonese) — treats them as single language class","Confidence scores are not calibrated probabilities — relative ranking is reliable but absolute values should not be thresholded without validation","Requires sufficient phonetic content — silence, music, or non-speech audio may produce unreliable language predictions","Quantization to int8 introduces 1-3% accuracy degradation on average, with larger gaps on low-resource languages","builder identity is not verified yet","no observed match outcomes yet"],"rank_breakdown":{"adoption":0.7523528611464299,"quality":0.37,"ecosystem":0.5000000000000001,"match_graph":0.25,"freshness":0.75,"weights":{"adoption":0.35,"quality":0.2,"ecosystem":0.1,"match_graph":0.3,"freshness":0.05}},"observed_outcomes":{"matches":0,"success_rate":0,"avg_confidence":0,"top_intents":[],"last_matched_at":null},"maintenance":{"status":"active","updated_at":"2026-05-24T12:16:22.765Z","last_scraped_at":"2026-05-03T14:22:52.901Z","last_commit":null},"community":{"stars":null,"forks":null,"weekly_downloads":null,"model_downloads":1305832,"model_likes":376}},"distribution":{"claim_url":"https://unfragile.ai/submit?claim=distil-whisper--distil-large-v3","compare_url":"https://unfragile.ai/compare?artifact=distil-whisper--distil-large-v3"}},"signature":"mEE3kReSa/UsmSgNTCmdVxnjHPKHNoks5hYEZLGMiG3lOqT46oq5NYCDLaj1535VeaCxIVAdxTubQSOCytprDQ==","signedAt":"2026-06-22T04:35:36.182Z","signedBy":"unfragile.ai","version":1},"_links":{"self":"https://unfragile.ai/api/v1/passport/distil-whisper--distil-large-v3","artifact":"https://unfragile.ai/distil-whisper--distil-large-v3","verify":"https://unfragile.ai/api/v1/verify?slug=distil-whisper--distil-large-v3","publicKey":"https://unfragile.ai/api/v1/trust-passport-public-key","spec":"https://unfragile.ai/trust","schema":"https://unfragile.ai/schema.json","docs":"https://unfragile.ai/docs"}}