{"passport":{"unfragile":{"@version":"1.0","version":"2026-05","artifact":{"id":"hf-model-pyannote--voice-activity-detection","slug":"pyannote--voice-activity-detection","name":"voice-activity-detection","type":"model","url":"https://huggingface.co/pyannote/voice-activity-detection","page_url":"https://unfragile.ai/pyannote--voice-activity-detection","categories":["voice-audio"],"tags":["pyannote-audio","pyannote","pyannote-audio-pipeline","audio","voice","speech","speaker","voice-activity-detection","automatic-speech-recognition","dataset:ami","dataset:dihard","dataset:voxconverse","license:mit","region:us"],"pricing":{"model":"open_source","free":true,"starting_price":null},"status":"active","verified":false},"capabilities":[{"id":"hf-model-pyannote--voice-activity-detection__cap_0","uri":"capability://data.processing.analysis.frame.level.voice.activity.classification.with.temporal.smoothing","name":"frame-level voice activity classification with temporal smoothing","description":"Classifies audio frames (typically 10-20ms windows) as speech or non-speech using a neural encoder-classifier architecture trained on multi-domain speech corpora. Applies temporal smoothing via post-processing to reduce frame-level noise and produce stable speech/silence segments. The model uses a segmentation-based approach rather than endpoint detection, enabling detection of speech activity within longer audio streams without requiring explicit start/end markers.","intents":["I need to identify where speech occurs in a long audio file to segment it for downstream processing","I want to remove silence and background noise from recordings before transcription to improve ASR accuracy","I need to detect speech activity in real-time streaming audio to trigger recording or processing pipelines","I want to extract only the speech portions from mixed audio containing music, noise, and silence"],"best_for":["speech processing engineers building ASR pipelines","audio preprocessing teams working with noisy or mixed-source recordings","developers implementing speaker diarization systems that require speech segmentation as a prerequisite","teams building real-time voice interaction systems needing low-latency speech detection"],"limitations":["Frame-level predictions require post-processing smoothing; raw outputs are noisy without temporal aggregation","Performance degrades on heavily accented speech or non-English languages not well-represented in training data (trained primarily on English, French, Spanish corpora)","Requires minimum audio duration context (~500ms) for stable predictions; very short utterances may be misclassified","No speaker-aware filtering; cannot distinguish between multiple simultaneous speakers or prioritize specific speaker voices","Latency increases with audio length; batch processing recommended for files >10 minutes to avoid memory overhead"],"requires":["Python 3.7+","PyTorch 1.9+ (CPU or GPU)","librosa or torchaudio for audio loading and preprocessing","pyannote.audio library (provides model wrapper and inference utilities)","Audio input at 16kHz sample rate (model expects this; resampling required for other rates)"],"input_types":["audio file (WAV, MP3, FLAC, OGG)","raw audio array (numpy array or torch tensor)","streaming audio buffer (with frame-based processing)"],"output_types":["frame-level binary labels (speech/non-speech per frame)","temporal segments with start/end timestamps and confidence scores","smoothed speech activity timeline (timeline object in pyannote format)"],"categories":["data-processing-analysis","audio-processing"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-pyannote--voice-activity-detection__cap_1","uri":"capability://data.processing.analysis.multi.domain.speech.activity.detection.with.cross.dataset.generalization","name":"multi-domain speech activity detection with cross-dataset generalization","description":"Generalizes voice activity detection across diverse acoustic domains (meetings, broadcast, conversational speech, telephony) through training on heterogeneous datasets (AMI, DIHARD, VoxConverse) with domain-agnostic feature learning. The model learns invariant representations that transfer across different microphone types, background noise profiles, and speaker characteristics without requiring domain adaptation or fine-tuning per use case.","intents":["I need a single VAD model that works reliably on meeting recordings, podcast audio, and phone calls without retraining","I want to process diverse audio sources (YouTube, Zoom, broadcast) with consistent speech detection quality","I need to deploy VAD in production without collecting and labeling domain-specific training data","I want to avoid maintaining separate models for different audio sources and microphone types"],"best_for":["production systems processing heterogeneous audio sources","teams without domain-specific labeled data for fine-tuning","multi-tenant platforms serving diverse use cases (transcription, analytics, archival)","researchers benchmarking VAD performance across standard datasets"],"limitations":["Cross-domain training may sacrifice peak performance on any single domain compared to domain-specific models","Generalization breaks down on highly specialized domains (e.g., underwater acoustics, extreme noise environments >80dB SNR)","No explicit handling of code-switching or multilingual speech; performance varies by language representation in training data","Requires sufficient training data diversity; performance on truly novel domains (e.g., synthetic speech, heavily processed audio) is unpredictable"],"requires":["Python 3.7+","PyTorch 1.9+","pyannote.audio library with pretrained weights (~50MB model file)","Access to HuggingFace model hub or local model cache","Audio preprocessing pipeline (resampling to 16kHz, normalization)"],"input_types":["audio file from diverse sources (meeting recordings, podcasts, broadcasts, phone calls)","raw audio arrays with varying sample rates and bit depths","streaming audio with variable background noise and microphone characteristics"],"output_types":["speech activity timeline with temporal boundaries","frame-level confidence scores indicating speech probability","domain-agnostic speech segments suitable for downstream processing"],"categories":["data-processing-analysis","audio-processing"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-pyannote--voice-activity-detection__cap_2","uri":"capability://data.processing.analysis.low.latency.streaming.voice.activity.detection.with.frame.buffering","name":"low-latency streaming voice activity detection with frame buffering","description":"Processes audio in fixed-size frames (typically 10-20ms windows) enabling real-time or near-real-time VAD on streaming audio without requiring the full audio file upfront. Uses a sliding window buffer to maintain temporal context for smoothing while emitting predictions with minimal latency (~100-200ms depending on frame size and post-processing window). Suitable for live transcription, voice command detection, and interactive voice applications where latency is critical.","intents":["I need to detect speech activity in real-time streaming audio for voice assistant applications","I want to trigger recording or transcription pipelines with minimal delay when speech is detected","I need to process live audio from microphone input without buffering the entire stream","I want to implement voice-activated features with sub-second response latency"],"best_for":["real-time voice assistant and voice command systems","live transcription and captioning applications","voice-activated IoT devices with latency constraints","interactive voice interaction systems requiring immediate feedback"],"limitations":["Streaming predictions are less stable than batch processing; frame-level noise requires aggressive smoothing which adds latency","Minimum context window (~500ms) needed for reliable predictions; very short audio clips may be misclassified","Memory footprint grows with context window size; very long smoothing windows (>5 seconds) may cause memory issues on edge devices","Requires careful tuning of frame size and smoothing window for optimal latency/accuracy tradeoff; no automatic parameter selection","State management required for streaming; stateless inference not possible without losing temporal context"],"requires":["Python 3.7+ or compatible runtime (C++/ONNX for edge deployment)","PyTorch 1.9+ or ONNX Runtime for inference","Audio streaming interface (e.g., pyaudio, sounddevice for microphone input)","Frame-based audio buffering mechanism (typically 160-320 samples per frame at 16kHz)","Temporal smoothing post-processor (included in pyannote.audio.pipelines)"],"input_types":["streaming audio buffer (frame-by-frame)","microphone input stream","network audio stream (RTP, WebRTC)"],"output_types":["real-time speech/non-speech labels per frame","smoothed speech activity events with timestamps","confidence scores for speech probability"],"categories":["data-processing-analysis","automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-pyannote--voice-activity-detection__cap_3","uri":"capability://data.processing.analysis.confidence.scored.speech.segmentation.with.temporal.boundaries","name":"confidence-scored speech segmentation with temporal boundaries","description":"Produces speech activity segments with precise start/end timestamps and per-segment confidence scores indicating model certainty. Converts frame-level predictions into segment-level output through boundary detection and merging algorithms, enabling downstream tasks to filter low-confidence segments or adjust processing based on speech reliability. Confidence scores reflect model uncertainty and can be used for adaptive processing (e.g., higher thresholds for noisy audio).","intents":["I need precise timestamps for speech segments to align with transcripts or other annotations","I want to filter out low-confidence speech detections to reduce false positives in noisy audio","I need confidence scores to implement adaptive processing (e.g., stricter thresholds for uncertain segments)","I want to merge nearby speech segments or split long segments for downstream processing"],"best_for":["speech processing pipelines requiring precise temporal alignment","quality assurance systems filtering low-confidence detections","adaptive audio processing systems adjusting parameters based on confidence","annotation and labeling workflows needing segment-level metadata"],"limitations":["Confidence scores are model-calibrated but not necessarily well-calibrated for all domains; may not reflect true error probability","Segment merging and boundary detection algorithms require tuning; default parameters may not suit all use cases","Very short segments (<100ms) may have unreliable confidence scores due to insufficient temporal context","Confidence scores do not account for downstream task requirements; a high-confidence non-speech segment may still be important for context","No speaker-level confidence; cannot distinguish between confident speech from one speaker vs. uncertain speech from another"],"requires":["Python 3.7+","pyannote.audio library with segmentation post-processor","Frame-level VAD predictions (from voice-activity-detection model)","Temporal smoothing configuration (window size, merge threshold)"],"input_types":["frame-level binary or soft predictions (0-1 confidence per frame)","temporal audio timeline with frame timestamps"],"output_types":["speech segments with start/end timestamps (millisecond precision)","per-segment confidence scores (0-1 range)","segment metadata (duration, confidence, frame count)"],"categories":["data-processing-analysis","audio-processing"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-pyannote--voice-activity-detection__cap_4","uri":"capability://data.processing.analysis.pretrained.feature.extraction.for.downstream.speech.tasks","name":"pretrained feature extraction for downstream speech tasks","description":"Exposes learned acoustic representations from the VAD model's encoder as features for downstream tasks (speaker diarization, speaker verification, emotion recognition). The model's internal representations capture speech-relevant acoustic patterns learned from multi-domain training, enabling transfer learning without retraining from scratch. Features can be extracted at frame-level or aggregated to segment-level for use in other models.","intents":["I want to use pretrained speech features for speaker diarization without training a new encoder","I need acoustic representations for speaker verification or voice biometrics","I want to extract embeddings for speech emotion or intent recognition","I need to build a custom speech processing model with pretrained features as input"],"best_for":["researchers building speech processing systems with limited labeled data","teams implementing speaker diarization pipelines (pyannote/speaker-diarization uses these features)","developers creating custom speech analysis models leveraging transfer learning","production systems needing efficient feature extraction without training overhead"],"limitations":["Features are optimized for VAD task; may not be optimal for all downstream tasks without fine-tuning","Feature dimensionality is fixed (~512-1024 dims depending on model variant); cannot be customized without retraining","No guarantee of feature stability across model versions; updates may change feature representations","Requires GPU for efficient extraction; CPU extraction is slow for large audio files","Features are domain-specific to speech; not suitable for non-speech audio tasks"],"requires":["Python 3.7+","PyTorch 1.9+ with GPU support recommended","pyannote.audio library with feature extraction utilities","Access to model weights and encoder architecture","Sufficient memory for batch feature extraction (GPU VRAM or CPU RAM)"],"input_types":["audio files or arrays","streaming audio with frame-based extraction"],"output_types":["frame-level embeddings (typically 512-1024 dimensions)","segment-level aggregated embeddings (mean/max pooling)","feature matrices suitable for downstream ML models"],"categories":["data-processing-analysis","memory-knowledge"],"confidence":0.5,"matches":0,"success_rate":0}],"trust":{"score":51,"verified":false,"data_access_risk":"low","permissions":["Python 3.7+","PyTorch 1.9+ (CPU or GPU)","librosa or torchaudio for audio loading and preprocessing","pyannote.audio library (provides model wrapper and inference utilities)","Audio input at 16kHz sample rate (model expects this; resampling required for other rates)","PyTorch 1.9+","pyannote.audio library with pretrained weights (~50MB model file)","Access to HuggingFace model hub or local model cache","Audio preprocessing pipeline (resampling to 16kHz, normalization)","Python 3.7+ or compatible runtime (C++/ONNX for edge deployment)"],"failure_modes":["Frame-level predictions require post-processing smoothing; raw outputs are noisy without temporal aggregation","Performance degrades on heavily accented speech or non-English languages not well-represented in training data (trained primarily on English, French, Spanish corpora)","Requires minimum audio duration context (~500ms) for stable predictions; very short utterances may be misclassified","No speaker-aware filtering; cannot distinguish between multiple simultaneous speakers or prioritize specific speaker voices","Latency increases with audio length; batch processing recommended for files >10 minutes to avoid memory overhead","Cross-domain training may sacrifice peak performance on any single domain compared to domain-specific models","Generalization breaks down on highly specialized domains (e.g., underwater acoustics, extreme noise environments >80dB SNR)","No explicit handling of code-switching or multilingual speech; performance varies by language representation in training data","Requires sufficient training data diversity; performance on truly novel domains (e.g., synthetic speech, heavily processed audio) is unpredictable","Streaming predictions are less stable than batch processing; frame-level noise requires aggressive smoothing which adds latency","builder identity is not verified yet","no observed match outcomes yet"],"rank_breakdown":{"adoption":0.8017302816850221,"quality":0.35,"ecosystem":0.5000000000000001,"match_graph":0.25,"freshness":0.75,"weights":{"adoption":0.35,"quality":0.2,"ecosystem":0.1,"match_graph":0.3,"freshness":0.05}},"observed_outcomes":{"matches":0,"success_rate":0,"avg_confidence":0,"top_intents":[],"last_matched_at":null},"maintenance":{"status":"active","updated_at":"2026-05-24T12:16:22.765Z","last_scraped_at":"2026-05-03T14:22:52.900Z","last_commit":null},"community":{"stars":null,"forks":null,"weekly_downloads":null,"model_downloads":3094665,"model_likes":231}},"distribution":{"claim_url":"https://unfragile.ai/submit?claim=pyannote--voice-activity-detection","compare_url":"https://unfragile.ai/compare?artifact=pyannote--voice-activity-detection"}},"signature":"ZoVqHeOzashk2wLno+kMtUmnlsWTAUbJssij5NZvlnF2E5Qd3XqKDuIYgrl5B+bdAX5+Lc942KT69YUd/PmfCQ==","signedAt":"2026-06-22T06:44:44.069Z","signedBy":"unfragile.ai","version":1},"_links":{"self":"https://unfragile.ai/api/v1/passport/pyannote--voice-activity-detection","artifact":"https://unfragile.ai/pyannote--voice-activity-detection","verify":"https://unfragile.ai/api/v1/verify?slug=pyannote--voice-activity-detection","publicKey":"https://unfragile.ai/api/v1/trust-passport-public-key","spec":"https://unfragile.ai/trust","schema":"https://unfragile.ai/schema.json","docs":"https://unfragile.ai/docs"}}