{"passport":{"unfragile":{"@version":"1.0","version":"2026-05","artifact":{"id":"github-vchitect--vbench","slug":"vchitect--vbench","name":"VBench","type":"benchmark","url":"https://vchitect.github.io/VBench-project/","page_url":"https://unfragile.ai/vchitect--vbench","categories":["video-generation"],"tags":["aigc","benchmark","dataset","evaluation-kit","gen-ai","stable-diffusion","text-to-video","video-generation"],"pricing":{"model":"open_source","free":true,"starting_price":null},"status":"active","verified":false},"capabilities":[{"id":"github-vchitect--vbench__cap_0","uri":"capability://data.processing.analysis.multi.dimensional.video.generation.quality.evaluation.with.decomposed.metrics","name":"multi-dimensional video generation quality evaluation with decomposed metrics","description":"Evaluates video generative models across 16-18 fine-grained dimensions (7 technical quality + 9 semantic understanding + 2 intrinsic faithfulness categories) rather than holistic scoring. Uses a modular evaluation pipeline where each dimension is computed independently via specialized pretrained models (CLIP, optical flow, scene detection, action recognition), then aggregated with human-preference-aligned weighting. The architecture separates concerns: quality metrics (resolution, motion smoothness, flicker) run through video processing pipelines, semantic metrics (object consistency, action fidelity) use vision-language models, and trustworthiness dimensions employ anomaly detection and human preference validation.","intents":["Compare text-to-video models on standardized benchmarks with interpretable dimension-level scores","Identify specific quality weaknesses in generated videos (e.g., motion blur vs. temporal consistency)","Validate that video generation improvements on one dimension don't regress others","Rank models on a public leaderboard with reproducible, dimension-aware scoring"],"best_for":["AI researchers evaluating T2V/I2V model architectures","Model developers optimizing for specific quality dimensions","Benchmark maintainers needing standardized, decomposed evaluation"],"limitations":["Requires pretrained models (CLIP, optical flow networks) which add ~2-5 minutes per video evaluation","Dimension scores are model-dependent (CLIP-based semantic metrics may not capture all semantic understanding)","No real-time evaluation — designed for batch assessment of generated videos","Human preference validation limited to annotated subset; may not generalize to all use cases"],"requires":["Python 3.8+","PyTorch 1.9+","CUDA 11.0+ (for GPU acceleration of pretrained models)","Generated video files in MP4 or AVI format","Pretrained model weights (auto-downloaded on first run)"],"input_types":["video files (MP4, AVI, MOV)","text prompts (for T2V evaluation)","image files (for I2V evaluation)"],"output_types":["JSON with per-dimension scores (0-100 scale)","aggregated overall score","dimension-level breakdowns for analysis"],"categories":["data-processing-analysis","evaluation-framework"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"github-vchitect--vbench__cap_1","uri":"capability://data.processing.analysis.standardized.prompt.suite.generation.and.curation.for.video.model.comparison","name":"standardized prompt suite generation and curation for video model comparison","description":"Maintains curated, balanced prompt datasets for text-to-video evaluation that ensure consistent, fair model comparison. The prompt suite is organized by semantic categories (objects, actions, scenes, attributes) with stratified sampling to cover diverse generation challenges. Prompts are validated against human preference annotations to ensure they discriminate between model quality levels. The system provides both the original VBench prompt set (used in CVPR 2024 leaderboard) and extended suites for I2V and long-video evaluation, with metadata mapping prompts to evaluation dimensions.","intents":["Use standardized prompts to compare video generation models on equal footing","Ensure evaluation prompts cover diverse semantic categories and difficulty levels","Validate that prompt-level scores correlate with human preference judgments","Extend evaluation to new domains (image-to-video, long-form video) with appropriate prompt sets"],"best_for":["Model developers benchmarking against published leaderboards","Researchers conducting comparative studies of T2V/I2V models","Benchmark maintainers ensuring evaluation consistency across submissions"],"limitations":["Prompt suite is fixed to ensure leaderboard reproducibility — cannot be customized per user","Prompts are English-only; no multilingual evaluation support","Semantic category coverage may not match all application domains (e.g., specialized scientific visualization)","Prompt difficulty is not explicitly labeled — users must infer from dimension-level performance"],"requires":["Access to sampled_videos dataset (provided in repo)","Human preference labels for validation (included in VBench-2.0)","Metadata mapping files (YAML configs for subject/background/action mapping)"],"input_types":["text prompts (English)","optional: image files for I2V prompts"],"output_types":["prompt metadata (category, difficulty, dimension alignment)","human preference labels (for validation)","prompt-to-dimension mapping (which prompts test which dimensions)"],"categories":["data-processing-analysis","memory-knowledge"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"github-vchitect--vbench__cap_10","uri":"capability://automation.workflow.public.leaderboard.with.dimension.level.ranking.and.model.comparison","name":"public leaderboard with dimension-level ranking and model comparison","description":"Maintains a public leaderboard for ranking video generation models based on VBench evaluation results. The leaderboard displays both overall scores and dimension-level breakdowns, enabling fine-grained model comparison. Implements score normalization and aggregation logic to ensure fair comparison across different model architectures and training approaches. Supports filtering and sorting by dimension, allowing users to identify models that excel in specific areas (e.g., motion quality vs. semantic consistency). The leaderboard infrastructure handles submission validation, duplicate detection, and result archival.","intents":["Compare video generation models on standardized benchmarks","Identify models that excel in specific dimensions","Track model improvements over time as new versions are submitted","Validate that leaderboard submissions use standardized evaluation"],"best_for":["Model developers benchmarking against competitors","Researchers tracking state-of-the-art in video generation","Benchmark maintainers managing community submissions"],"limitations":["Leaderboard only includes submitted models — may not cover all published work","Submissions are self-reported — no independent verification of results","Score normalization may favor certain model types over others","Leaderboard updates are periodic — real-time results not available"],"requires":["Completed VBench evaluation (JSON results file)","Model metadata (name, organization, paper link)","Leaderboard account and API key","Compliance with submission guidelines"],"input_types":["evaluation results JSON file","model metadata (name, version, description, paper link)"],"output_types":["leaderboard ranking (overall and per-dimension)","model comparison visualizations","submission receipt and validation report"],"categories":["automation-workflow","search-retrieval"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"github-vchitect--vbench__cap_11","uri":"capability://data.processing.analysis.video.processing.pipeline.with.optical.flow.and.frame.analysis","name":"video processing pipeline with optical flow and frame analysis","description":"Implements a modular video processing pipeline that extracts features and metrics from video frames for evaluation. The pipeline includes optical flow computation (using pretrained optical flow networks) for motion analysis, frame-to-frame consistency detection for flicker/jitter measurement, and temporal sampling strategies for efficient processing of long videos. Uses configurable frame sampling (every Nth frame, adaptive sampling based on motion) to balance computational cost and temporal coverage. The pipeline is designed for reusability: computed features (optical flow, frame embeddings) are cached and reused across multiple evaluation dimensions.","intents":["Extract motion and consistency features from generated videos","Compute optical flow for motion quality evaluation","Detect temporal artifacts (flicker, jitter, discontinuities)","Efficiently process long videos with adaptive frame sampling"],"best_for":["Researchers developing motion-aware video metrics","Teams analyzing video quality at frame level","Benchmark maintainers optimizing evaluation efficiency"],"limitations":["Optical flow computation is expensive — typically 1-5 minutes per video","Optical flow quality degrades on fast motion or occlusions","Frame sampling strategy affects metric reliability — requires tuning per video type","Cached features consume significant disk space (1-10GB per 100 videos)"],"requires":["Video files in supported format (MP4, AVI)","Optical flow model weights (auto-downloaded, ~500MB)","Sufficient disk space for feature caching","CUDA-capable GPU for efficient computation"],"input_types":["video files (MP4, AVI, MOV)","optional: frame sampling configuration (stride, adaptive parameters)"],"output_types":["optical flow arrays (per-frame motion vectors)","frame embeddings (CLIP or other vision models)","consistency metrics (flicker, jitter scores)","cached features for reuse"],"categories":["data-processing-analysis","image-visual"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"github-vchitect--vbench__cap_2","uri":"capability://automation.workflow.distributed.batch.evaluation.pipeline.with.pretrained.model.orchestration","name":"distributed batch evaluation pipeline with pretrained model orchestration","description":"Orchestrates evaluation of multiple videos across distributed compute resources by decomposing the pipeline into independent dimension-computation stages. Each dimension is computed via a specialized pretrained model (CLIP for semantic understanding, optical flow networks for motion metrics, action recognition models for temporal consistency). The pipeline uses a modular architecture where videos are processed sequentially through each dimension's computation graph, with intermediate results cached to avoid redundant model inference. Supports both local and distributed execution via configuration, with automatic GPU memory management and batch processing for efficiency.","intents":["Evaluate hundreds of generated videos efficiently without manual orchestration","Parallelize dimension computation across multiple GPUs or machines","Cache intermediate results to avoid recomputing expensive metrics","Monitor evaluation progress and handle failures gracefully"],"best_for":["Teams running large-scale model evaluations (100+ videos)","Researchers with access to multi-GPU clusters","Benchmark maintainers processing leaderboard submissions"],"limitations":["Distributed execution requires manual cluster setup — no built-in Kubernetes/Slurm integration","Caching is local to machine — no distributed cache across cluster nodes","Memory overhead scales with batch size; typical GPU requires 8-16GB VRAM per dimension","No checkpointing — if evaluation fails mid-pipeline, must restart from beginning"],"requires":["Python 3.8+","PyTorch with CUDA support","Pretrained model weights (auto-downloaded, ~5GB total)","Video files in supported format (MP4, AVI)","Configuration file specifying evaluation dimensions and batch size"],"input_types":["video files (directory path or list)","YAML configuration (dimension selection, batch size, GPU allocation)","optional: cached intermediate results from previous runs"],"output_types":["JSON results per video (dimension scores)","aggregated leaderboard scores","evaluation logs and timing statistics"],"categories":["automation-workflow","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"github-vchitect--vbench__cap_3","uri":"capability://data.processing.analysis.human.preference.aligned.metric.scoring.with.learned.aggregation.weights","name":"human-preference-aligned metric scoring with learned aggregation weights","description":"Learns dimension-level aggregation weights from human preference annotations to ensure computed metrics correlate with human judgment. The system collects human preference labels for generated videos (e.g., 'video A is better than video B'), then uses these labels to calibrate how individual dimension scores (motion smoothness, semantic consistency, etc.) are weighted in the final aggregated score. This approach ensures that the benchmark's scoring aligns with human perception rather than arbitrary metric combinations. VBench-2.0 extends this with anomaly detection to identify videos that violate human preferences, enabling refinement of the metric suite.","intents":["Ensure benchmark scores correlate with human preference judgments","Learn which dimensions matter most for overall video quality perception","Identify and flag videos where metrics disagree with human judgment","Validate that metric improvements translate to human-perceived quality gains"],"best_for":["Benchmark designers validating metric suites against human judgment","Researchers studying correlation between automated metrics and human perception","Teams deploying video generation models where human satisfaction is critical"],"limitations":["Requires human annotation effort — typically 100-500 preference judgments per dimension","Learned weights may not generalize to new video domains or model architectures","Human preferences are subjective and may vary by demographic or cultural context","Anomaly detection identifies outliers but doesn't automatically improve metrics"],"requires":["Human preference labels (provided in VBench-2.0 dataset)","Computed dimension scores for all videos","Statistical tools for correlation analysis (included in vbench2 package)"],"input_types":["dimension scores (JSON from evaluation pipeline)","human preference labels (pairwise comparisons or rankings)","optional: video metadata for stratified analysis"],"output_types":["learned aggregation weights (per dimension)","correlation coefficients (metric vs. human preference)","anomaly flags (videos where metrics disagree with humans)","calibrated final scores"],"categories":["data-processing-analysis","planning-reasoning"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"github-vchitect--vbench__cap_4","uri":"capability://image.visual.image.to.video.i2v.generation.evaluation.with.motion.and.consistency.metrics","name":"image-to-video (i2v) generation evaluation with motion and consistency metrics","description":"Extends evaluation framework to image-to-video generation by adding I2V-specific dimensions that measure motion quality, temporal consistency, and adherence to input image constraints. Implements specialized metrics for evaluating how well generated videos maintain visual consistency with the input image while introducing plausible motion. Uses optical flow analysis to measure motion smoothness, frame-to-frame consistency metrics to detect flickering or jitter, and CLIP-based similarity to ensure the generated video remains faithful to the input image. The I2V evaluation pipeline is integrated into the VBench++ framework with separate prompt suites and dimension definitions.","intents":["Evaluate image-to-video models on motion quality and temporal consistency","Measure how well I2V models preserve input image content while adding motion","Compare I2V models on standardized benchmarks with interpretable metrics","Identify specific I2V failure modes (e.g., content drift, unnatural motion)"],"best_for":["Researchers developing image-to-video generation models","Teams evaluating I2V models for production deployment","Benchmark maintainers extending evaluation to I2V domain"],"limitations":["I2V evaluation requires both input images and generated videos — more data than T2V","Motion metrics are sensitive to optical flow estimation quality — may fail on complex scenes","Consistency metrics assume input image should be preserved — may penalize creative interpretations","Limited to short videos (typically <10 seconds) due to computational constraints"],"requires":["Input image files (PNG, JPG)","Generated video files (MP4, AVI)","Optical flow model (auto-downloaded)","CLIP model for image-video similarity"],"input_types":["image files (input for I2V generation)","video files (generated by I2V model)","optional: motion prompts or action descriptions"],"output_types":["I2V-specific dimension scores (motion quality, consistency, content preservation)","aggregated I2V score","per-frame consistency analysis"],"categories":["image-visual","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"github-vchitect--vbench__cap_5","uri":"capability://image.visual.long.form.video.generation.evaluation.with.temporal.coherence.and.scene.consistency","name":"long-form video generation evaluation with temporal coherence and scene consistency","description":"Extends evaluation to long-form videos (>10 seconds) by adding dimensions that measure temporal coherence across longer sequences, scene consistency, and subject persistence. Implements specialized metrics for detecting temporal discontinuities (abrupt scene changes, subject disappearance), measuring motion consistency over extended durations, and evaluating semantic coherence across multiple scenes. Uses slow-fast network architectures for efficient long-video processing, with configurable temporal window sizes to balance computational cost and temporal coverage. The VBench-Long framework includes separate prompt suites and evaluation pipelines optimized for long-form content.","intents":["Evaluate long-form video generation models on temporal coherence and scene consistency","Measure subject persistence and motion continuity across extended video sequences","Identify temporal discontinuities and scene-boundary artifacts in generated videos","Compare long-form video models on standardized benchmarks"],"best_for":["Researchers developing long-form video generation models","Teams evaluating models for narrative or cinematic video generation","Benchmark maintainers extending evaluation to longer temporal horizons"],"limitations":["Long-form evaluation is computationally expensive — typically 10-30 minutes per video","Temporal coherence metrics are sensitive to scene boundaries — may produce noisy scores","Subject persistence metrics assume consistent subject appearance — fail on occlusions or transformations","Limited to videos <60 seconds due to memory constraints of temporal models"],"requires":["Long-form video files (MP4, AVI, 10-60 seconds duration)","Slow-fast network weights (auto-downloaded)","Temporal coherence model (included in vbench2_beta_long)","Configuration for temporal window size and stride"],"input_types":["long-form video files (10-60 seconds)","text prompts describing narrative or scene progression","optional: scene boundary annotations"],"output_types":["long-form-specific dimension scores (temporal coherence, scene consistency, subject persistence)","aggregated long-form score","temporal discontinuity detection (frame indices where coherence breaks)"],"categories":["image-visual","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"github-vchitect--vbench__cap_6","uri":"capability://safety.moderation.trustworthiness.and.safety.evaluation.for.video.generation.models","name":"trustworthiness and safety evaluation for video generation models","description":"Evaluates trustworthiness dimensions of video generation models, including robustness to adversarial prompts, bias detection, and safety compliance. Implements metrics for detecting whether models generate harmful content (violence, explicit material), exhibit demographic biases, or produce anomalous outputs in response to edge-case prompts. Uses human anomaly detection to identify videos that violate safety guidelines, combined with automated classifiers for bias and harmful content detection. The VBench-Trustworthiness framework integrates these dimensions into the overall evaluation pipeline with separate scoring and aggregation logic.","intents":["Evaluate video generation models for safety and trustworthiness before deployment","Detect demographic biases and harmful content generation in video models","Measure robustness to adversarial or edge-case prompts","Compare models on trustworthiness dimensions alongside quality metrics"],"best_for":["Teams deploying video generation models in production environments","Researchers studying safety and bias in generative models","Benchmark maintainers adding trustworthiness evaluation to leaderboards"],"limitations":["Trustworthiness metrics are subjective and culturally dependent — may not generalize across regions","Automated bias detection is imperfect — requires human review for final safety decisions","Adversarial prompt suite is limited — may not cover all potential safety issues","Trustworthiness scores are not directly comparable to quality scores — require separate interpretation"],"requires":["Generated videos from safety-relevant prompts (adversarial, edge-case, demographic)","Human annotations for anomaly detection (provided in VBench-Trustworthiness)","Bias detection models (auto-downloaded)","Safety classifier (included in vbench2 package)"],"input_types":["video files generated from safety-relevant prompts","text prompts (adversarial, edge-case, demographic-sensitive)","optional: human annotations for anomaly validation"],"output_types":["trustworthiness dimension scores (safety, bias, robustness)","anomaly flags (videos violating safety guidelines)","bias detection results (demographic representation analysis)","aggregated trustworthiness score"],"categories":["safety-moderation","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"github-vchitect--vbench__cap_7","uri":"capability://data.processing.analysis.intrinsic.faithfulness.evaluation.with.prompt.to.video.alignment.metrics","name":"intrinsic faithfulness evaluation with prompt-to-video alignment metrics","description":"Evaluates intrinsic faithfulness — the degree to which generated videos align with input prompts — across 18 fine-grained dimensions organized into 5 categories: object fidelity, spatial relationships, temporal dynamics, appearance consistency, and semantic understanding. Uses multimodal models (CLIP, action recognition, scene understanding) to measure alignment between prompt descriptions and generated video content. Implements specialized metrics for each faithfulness dimension (e.g., object presence detection, spatial relationship verification, action execution quality). VBench-2.0 extends the core framework with these intrinsic faithfulness dimensions, validated against human preference annotations.","intents":["Measure how faithfully generated videos match input text prompts","Identify specific faithfulness gaps (e.g., missing objects, incorrect spatial relationships)","Evaluate semantic understanding of complex prompts in video generation","Compare models on prompt-alignment quality with interpretable dimension-level scores"],"best_for":["Researchers developing prompt-aware video generation models","Teams evaluating models for applications requiring high prompt fidelity","Benchmark maintainers measuring semantic understanding in video generation"],"limitations":["Faithfulness metrics depend on CLIP and action recognition model quality — may miss semantic nuances","Prompt complexity affects metric reliability — simple prompts produce more reliable scores","Spatial relationship metrics are limited to 2D video frames — cannot measure 3D spatial accuracy","Temporal dynamics metrics assume linear time progression — may fail on non-linear narratives"],"requires":["Text prompts (English, describing desired video content)","Generated video files (MP4, AVI)","CLIP model for prompt-video alignment","Action recognition model for temporal dynamics evaluation","Scene understanding model for spatial relationship verification"],"input_types":["text prompts (English descriptions of video content)","video files (generated by T2V model)","optional: structured prompt annotations (objects, actions, spatial relationships)"],"output_types":["faithfulness dimension scores (object fidelity, spatial relationships, temporal dynamics, appearance, semantics)","aggregated faithfulness score","per-dimension alignment analysis","prompt-video alignment visualization (optional)"],"categories":["data-processing-analysis","planning-reasoning"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"github-vchitect--vbench__cap_8","uri":"capability://automation.workflow.command.line.interface.for.batch.evaluation.and.leaderboard.submission","name":"command-line interface for batch evaluation and leaderboard submission","description":"Provides a unified CLI for running evaluations, managing configurations, and submitting results to the VBench leaderboard. The CLI supports both VBench (16 dimensions) and VBench-2.0 (18 dimensions) evaluation modes, with configuration-driven dimension selection, batch processing, and result aggregation. Implements subcommands for evaluation (vbench2 evaluate), leaderboard submission (vbench2 submit), and result visualization. The CLI handles model weight downloading, GPU memory management, and error recovery, abstracting away implementation details while exposing key parameters (batch size, dimension selection, output format).","intents":["Run standardized evaluations without writing Python code","Submit model evaluation results to the VBench leaderboard","Configure evaluation dimensions and parameters via YAML files","Monitor evaluation progress and retrieve results in standard formats"],"best_for":["Model developers benchmarking against published leaderboards","Researchers running evaluations without deep framework knowledge","Benchmark maintainers processing leaderboard submissions"],"limitations":["CLI is Python-only — requires Python 3.8+ installation","Configuration is YAML-based — limited validation of invalid parameter combinations","No interactive progress visualization — only log-based status updates","Leaderboard submission requires manual account creation and API key management"],"requires":["Python 3.8+","vbench or vbench2 package (pip install vbench2)","Video files in supported format","YAML configuration file (examples provided in repo)","Optional: API key for leaderboard submission"],"input_types":["video directory path","YAML configuration file","optional: model metadata (name, version, description)"],"output_types":["JSON results file (dimension scores, aggregated score)","evaluation logs (timing, errors, warnings)","leaderboard submission receipt (if submitted)"],"categories":["automation-workflow","tool-use-integration"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"github-vchitect--vbench__cap_9","uri":"capability://tool.use.integration.python.api.for.programmatic.evaluation.and.custom.metric.integration","name":"python api for programmatic evaluation and custom metric integration","description":"Exposes a Python API for programmatic evaluation of video generation models, enabling integration into custom workflows and metric development. The API provides core classes (VBench, VBench2) that orchestrate evaluation pipelines, with methods for computing individual dimensions, aggregating scores, and retrieving detailed results. Supports custom metric registration via a plugin architecture, allowing researchers to add new dimensions without modifying core code. The API is designed for flexibility: users can evaluate single videos, batch process directories, or integrate evaluation into training loops.","intents":["Integrate VBench evaluation into custom training or optimization pipelines","Develop and test new evaluation dimensions before adding to official benchmark","Programmatically analyze evaluation results and generate custom reports","Extend VBench with domain-specific metrics"],"best_for":["Researchers developing new video generation models and metrics","Teams integrating evaluation into automated ML pipelines","Metric developers prototyping new dimensions"],"limitations":["API documentation is limited — requires reading source code for advanced usage","Custom metric registration requires understanding internal architecture","No built-in support for distributed evaluation across multiple machines","API stability is not guaranteed across minor versions"],"requires":["Python 3.8+","vbench or vbench2 package","PyTorch and CUDA (for GPU acceleration)","Familiarity with Python and PyTorch"],"input_types":["video file paths (single or batch)","optional: custom configuration dictionaries","optional: custom metric functions"],"output_types":["dimension scores (dict or DataFrame)","aggregated scores","detailed evaluation logs","intermediate results (optical flow, CLIP embeddings, etc.)"],"categories":["tool-use-integration","code-generation-editing"],"confidence":0.5,"matches":0,"success_rate":0}],"trust":{"score":35,"verified":false,"data_access_risk":"low","permissions":["Python 3.8+","PyTorch 1.9+","CUDA 11.0+ (for GPU acceleration of pretrained models)","Generated video files in MP4 or AVI format","Pretrained model weights (auto-downloaded on first run)","Access to sampled_videos dataset (provided in repo)","Human preference labels for validation (included in VBench-2.0)","Metadata mapping files (YAML configs for subject/background/action mapping)","Completed VBench evaluation (JSON results file)","Model metadata (name, organization, paper link)"],"failure_modes":["Requires pretrained models (CLIP, optical flow networks) which add ~2-5 minutes per video evaluation","Dimension scores are model-dependent (CLIP-based semantic metrics may not capture all semantic understanding)","No real-time evaluation — designed for batch assessment of generated videos","Human preference validation limited to annotated subset; may not generalize to all use cases","Prompt suite is fixed to ensure leaderboard reproducibility — cannot be customized per user","Prompts are English-only; no multilingual evaluation support","Semantic category coverage may not match all application domains (e.g., specialized scientific visualization)","Prompt difficulty is not explicitly labeled — users must infer from dimension-level performance","Leaderboard only includes submitted models — may not cover all published work","Submissions are self-reported — no independent verification of results","builder identity is not verified yet","no observed match outcomes yet"],"rank_breakdown":{"adoption":0.2567027582803721,"quality":0.34,"ecosystem":0.6000000000000001,"match_graph":0.25,"freshness":0.6,"weights":{"adoption":0.25,"quality":0.35,"ecosystem":0.15,"match_graph":0.2,"freshness":0.05}},"observed_outcomes":{"matches":0,"success_rate":0,"avg_confidence":0,"top_intents":[],"last_matched_at":null},"maintenance":{"status":"active","updated_at":"2026-05-24T12:16:22.064Z","last_scraped_at":"2026-05-03T13:59:47.981Z","last_commit":"2026-03-23T08:32:27Z"},"community":{"stars":1615,"forks":114,"weekly_downloads":null,"model_downloads":null,"model_likes":null}},"distribution":{"claim_url":"https://unfragile.ai/submit?claim=vchitect--vbench","compare_url":"https://unfragile.ai/compare?artifact=vchitect--vbench"}},"signature":"ng3Wxm3gaj12MWVnLzagTSZNwFXO4CMj116aYw5+QyOJs9ONAmAuSkwQGlALoHaez0QIOrJcxprG4j6++JQmAQ==","signedAt":"2026-06-22T13:10:00.057Z","signedBy":"unfragile.ai","version":1},"_links":{"self":"https://unfragile.ai/api/v1/passport/vchitect--vbench","artifact":"https://unfragile.ai/vchitect--vbench","verify":"https://unfragile.ai/api/v1/verify?slug=vchitect--vbench","publicKey":"https://unfragile.ai/api/v1/trust-passport-public-key","spec":"https://unfragile.ai/trust","schema":"https://unfragile.ai/schema.json","docs":"https://unfragile.ai/docs"}}