{"passport":{"unfragile":{"@version":"1.0","version":"2026-05","artifact":{"id":"hf-model-qwen--qwen2.5-3b-instruct","slug":"qwen--qwen2.5-3b-instruct","name":"Qwen2.5-3B-Instruct","type":"model","url":"https://huggingface.co/Qwen/Qwen2.5-3B-Instruct","page_url":"https://unfragile.ai/qwen--qwen2.5-3b-instruct","categories":["chatbots-assistants"],"tags":["transformers","safetensors","qwen2","text-generation","chat","conversational","en","arxiv:2407.10671","base_model:Qwen/Qwen2.5-3B","base_model:finetune:Qwen/Qwen2.5-3B","license:other","text-generation-inference","endpoints_compatible","deploy:azure","region:us"],"pricing":{"model":"open_source","free":true,"starting_price":null},"status":"active","verified":false},"capabilities":[{"id":"hf-model-qwen--qwen2.5-3b-instruct__cap_0","uri":"capability://text.generation.language.instruction.following.conversational.text.generation","name":"instruction-following conversational text generation","description":"Generates contextually relevant, multi-turn conversational responses using a transformer-based decoder architecture fine-tuned on instruction-following datasets. The model processes input tokens through 24 transformer layers with rotary positional embeddings (RoPE) and grouped-query attention (GQA) to reduce memory footprint, enabling efficient inference on consumer hardware while maintaining coherence across extended conversations.","intents":["Build a lightweight chatbot that runs locally without cloud dependencies","Deploy a conversational assistant on edge devices with limited VRAM (4-6GB)","Create a multi-turn dialogue system that understands context and user intent","Integrate a language model into applications where latency and cost matter more than state-of-the-art accuracy"],"best_for":["Solo developers building local LLM applications","Teams deploying on-device AI without cloud infrastructure","Resource-constrained environments (mobile, embedded systems, edge servers)","Prototyping conversational features before scaling to larger models"],"limitations":["Context window limited to 32,768 tokens — cannot process documents longer than ~25,000 words without truncation","Knowledge cutoff at training time (April 2024) — no real-time information or web awareness","Instruction-following quality degrades on highly specialized domains (medical, legal, scientific) compared to 70B+ models","No native tool-calling or function-invocation support — requires prompt engineering or external orchestration","Quantization to 4-bit or 8-bit reduces quality by ~5-10% on reasoning tasks"],"requires":["Python 3.8+","PyTorch 2.0+ or compatible inference engine (vLLM, Ollama, llama.cpp)","4-6GB VRAM for fp16 inference, 2-3GB for 8-bit quantization","HuggingFace transformers library (version 4.36+)","Optional: CUDA 11.8+ for GPU acceleration, or CPU-only mode (slower)"],"input_types":["plain text (single-turn or multi-turn conversation history)","structured prompt templates with system instructions","chat message arrays with role/content pairs (OpenAI format compatible)"],"output_types":["plain text response","streaming token sequences","logits and token probabilities (for sampling control)"],"categories":["text-generation-language","conversational-ai"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-qwen--qwen2.5-3b-instruct__cap_1","uri":"capability://data.processing.analysis.quantization.aware.inference.with.multiple.precision.formats","name":"quantization-aware inference with multiple precision formats","description":"Supports inference in multiple precision formats (fp16, int8, int4) through safetensors weight loading and compatibility with quantization frameworks like bitsandbytes and GPTQ. The model weights are stored in safetensors format (binary, memory-safe alternative to pickle) enabling fast loading and automatic dtype conversion, allowing developers to trade off between memory footprint and output quality based on hardware constraints.","intents":["Run the model on a 2GB VRAM GPU by applying 4-bit quantization","Load model weights safely without pickle deserialization vulnerabilities","Achieve 3-4x faster inference on CPU by using int8 quantization","Dynamically select precision at runtime based on available memory"],"best_for":["Developers deploying on resource-constrained hardware (Raspberry Pi, mobile, edge devices)","Teams requiring security-hardened model loading (safetensors prevents arbitrary code execution)","Applications where inference latency is critical and quantization tradeoffs are acceptable","Multi-tenant systems needing to fit multiple model instances in shared GPU memory"],"limitations":["4-bit quantization introduces ~3-8% accuracy degradation on factual recall and mathematical reasoning","Safetensors loading adds ~100-200ms overhead on first load (cached after initial conversion)","Quantization requires compatible inference engines — not all frameworks support all precision formats","Dynamic quantization at runtime adds ~50-100ms latency per inference call","No native support for mixed-precision inference (e.g., fp16 for attention, int8 for FFN layers)"],"requires":["bitsandbytes library (for 8-bit/4-bit quantization) or GPTQ (for pre-quantized weights)","safetensors Python library (0.3.1+)","PyTorch with quantization support (2.0+)","For 4-bit: CUDA 11.8+ or CPU mode (very slow)"],"input_types":["safetensors weight files (.safetensors)","HuggingFace model identifiers (auto-downloads and converts)","quantization configuration parameters (bits, group_size, desc_act)"],"output_types":["quantized model in memory (int4/int8 format)","dequantized logits (fp32) for output generation"],"categories":["data-processing-analysis","model-optimization"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-qwen--qwen2.5-3b-instruct__cap_10","uri":"capability://automation.workflow.efficient.inference.on.consumer.hardware.with.cpu.fallback","name":"efficient inference on consumer hardware with cpu fallback","description":"Optimizes inference for consumer-grade hardware through quantization, attention optimizations (grouped-query attention), and efficient implementations that enable running on CPUs when GPUs are unavailable. The model can be deployed on laptops, edge devices, and servers without specialized hardware, with graceful degradation from GPU to CPU inference without code changes.","intents":["Run the model on a laptop without GPU for local development and testing","Deploy on edge devices (Raspberry Pi, mobile) with limited resources","Provide CPU fallback when GPU is unavailable or overloaded","Reduce infrastructure costs by avoiding GPU requirements"],"best_for":["Developers building local-first applications","Edge AI and on-device deployment","Cost-sensitive deployments where GPU ROI is low","Development environments where GPU access is limited"],"limitations":["CPU inference is 10-50x slower than GPU — inference latency increases from ~100ms to 1-5 seconds per token","CPU memory usage is higher than GPU due to lack of optimized kernels — requires 8-16GB RAM","No native support for multi-core optimization — CPU utilization may be suboptimal","Quantization is required for practical CPU inference — fp16 is too slow","Streaming is impractical on CPU due to high latency per token"],"requires":["Python 3.8+","PyTorch with CPU support (no CUDA required)","8-16GB RAM for fp16, 4-8GB for 8-bit quantization","Optional: ONNX Runtime or llama.cpp for optimized CPU inference"],"input_types":["prompt text","generation parameters"],"output_types":["generated text","optional: token probabilities"],"categories":["automation-workflow","inference-optimization"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-qwen--qwen2.5-3b-instruct__cap_2","uri":"capability://text.generation.language.streaming.token.generation.with.configurable.sampling","name":"streaming token generation with configurable sampling","description":"Generates text incrementally via token-by-token streaming with support for temperature, top-k, top-p (nucleus sampling), and repetition penalty controls. The model outputs logits at each step, allowing downstream sampling strategies to be applied before token selection, enabling real-time response streaming to end-users and fine-grained control over generation diversity and coherence.","intents":["Stream chatbot responses to users in real-time instead of waiting for full generation","Implement custom sampling logic (e.g., constrained decoding, beam search) on top of raw logits","Control output diversity per-request (e.g., creative mode vs deterministic mode)","Reduce perceived latency by showing tokens as they're generated rather than batch-generating"],"best_for":["Web/mobile applications requiring real-time user feedback","Interactive chatbots where streaming improves perceived responsiveness","Research applications needing fine-grained control over sampling behavior","Systems where token-level monitoring or filtering is required"],"limitations":["Streaming adds ~5-10ms per token overhead due to I/O and serialization","Temperature and top-p sampling are applied post-generation — cannot influence model's internal attention during generation","Repetition penalty is heuristic-based and may over-suppress legitimate repeated words in lists or code","No native support for constrained decoding (e.g., JSON schema validation) — requires external grammar engines","Streaming incompatible with batch inference — must process requests sequentially"],"requires":["Inference framework supporting streaming (vLLM, text-generation-webserver, or transformers with custom generation loop)","Client capable of handling streaming responses (WebSocket, Server-Sent Events, or chunked HTTP)","Python 3.8+ with transformers library or compatible inference server"],"input_types":["prompt text with optional system instruction","generation parameters (temperature, top_k, top_p, max_tokens, repetition_penalty)","optional seed for reproducibility"],"output_types":["token stream (individual tokens as they're generated)","logits array (raw model output before sampling)","token probabilities (for uncertainty quantification)"],"categories":["text-generation-language","streaming-inference"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-qwen--qwen2.5-3b-instruct__cap_3","uri":"capability://text.generation.language.multi.language.instruction.understanding.with.english.primary.training","name":"multi-language instruction understanding with english-primary training","description":"Understands and responds to instructions in multiple languages (English, Chinese, Spanish, French, German, and others) through multilingual instruction-tuning, though with English as the primary training language. The model uses a shared vocabulary across languages and learned language-agnostic instruction representations, enabling cross-lingual transfer but with degraded performance on non-English languages compared to English.","intents":["Build a chatbot that handles user queries in multiple languages without separate models","Translate instructions from non-English languages and execute them correctly","Support international users without language-specific model variants","Reduce deployment complexity by using a single model for multiple languages"],"best_for":["Global applications serving users in multiple languages","Teams without resources to maintain language-specific model variants","Use cases where English-level quality is acceptable for non-English languages","Prototyping multilingual features before investing in language-specific fine-tuning"],"limitations":["Non-English language quality is 10-20% lower than English on instruction-following tasks","Chinese and other non-Latin scripts have lower token efficiency (more tokens per semantic unit)","No language detection — requires explicit language specification or context inference","Instruction-following quality varies significantly by language (English > Chinese > Spanish > others)","No native support for code-switching (mixing languages in single prompt)"],"requires":["Python 3.8+","Transformers library with multilingual tokenizer support","No additional language packs or dependencies"],"input_types":["text in any supported language","mixed-language prompts (though not optimized)","language-tagged instructions (optional, for explicit language specification)"],"output_types":["text in the same language as input (usually)","code or structured output (language-independent)"],"categories":["text-generation-language","multilingual-support"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-qwen--qwen2.5-3b-instruct__cap_4","uri":"capability://text.generation.language.system.prompt.and.role.based.instruction.injection","name":"system prompt and role-based instruction injection","description":"Accepts system prompts and role definitions that shape model behavior without fine-tuning, using a chat template that separates system instructions from user messages and model responses. The model processes the system prompt as context that influences all subsequent generations in a conversation, enabling dynamic behavior modification (e.g., 'act as a Python expert', 'respond in JSON format') without retraining.","intents":["Define a chatbot's personality or expertise area via system prompt (e.g., 'You are a helpful coding assistant')","Enforce output format constraints (e.g., 'Always respond in JSON') without grammar-based decoding","Create role-specific variants of the same model (customer support, technical support, creative writing)","Implement guardrails by instructing the model to refuse certain requests"],"best_for":["Multi-purpose chatbot platforms needing dynamic role switching","Applications requiring lightweight behavior customization without fine-tuning","Teams building prompt-based systems where system prompts are part of the product","Rapid prototyping of specialized assistants"],"limitations":["System prompt effectiveness depends on prompt engineering skill — no guarantee of compliance","Model may ignore or misinterpret system prompts if they conflict with training objectives","No native enforcement mechanism — model can violate system prompt instructions if incentivized by user input","System prompt tokens count against context window — long system prompts reduce available space for conversation history","Jailbreaking via user input can override system prompt constraints"],"requires":["Understanding of chat template format (role/content pairs)","Prompt engineering knowledge for effective system prompt design","Inference framework supporting multi-turn chat (transformers, vLLM, etc.)"],"input_types":["system prompt (string)","user message (string)","optional conversation history (array of role/content pairs)"],"output_types":["model response (string)","structured output if requested in system prompt (JSON, YAML, etc.)"],"categories":["text-generation-language","prompt-engineering"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-qwen--qwen2.5-3b-instruct__cap_5","uri":"capability://text.generation.language.context.aware.response.generation.with.32k.token.window","name":"context-aware response generation with 32k token window","description":"Maintains conversation context across up to 32,768 tokens (~25,000 words) using rotary positional embeddings (RoPE) that enable efficient long-context attention without quadratic memory scaling. The model can reference earlier messages in a conversation, retrieve relevant context from long documents, and generate coherent responses that depend on distant context, enabling multi-turn conversations and document-based Q&A without context truncation.","intents":["Build a chatbot that remembers and references earlier messages in long conversations","Implement document-based Q&A where the model answers questions about documents up to 25K words","Create a code review assistant that analyzes entire files or multiple related files","Support extended conversations without losing context or requiring manual summarization"],"best_for":["Document analysis and Q&A applications","Long-form conversational systems","Code analysis and review tools","Research assistants that need to maintain context across extended interactions"],"limitations":["32K token limit is insufficient for very large documents (e.g., entire books, large codebases) — requires chunking or summarization","Attention quality degrades slightly at the edges of the context window (first and last ~1K tokens)","Long context increases inference latency — processing 32K tokens takes ~5-10x longer than 4K tokens","Memory usage scales linearly with context length — 32K tokens requires ~8-12GB VRAM in fp16","No native support for hierarchical or sparse attention — all tokens attend to all other tokens"],"requires":["Inference framework supporting long-context attention (vLLM, text-generation-webserver, or transformers with custom attention)","8-12GB VRAM for fp16 inference with full 32K context","4-6GB VRAM for 8-bit quantization with full context"],"input_types":["conversation history (array of messages)","document text (up to 32K tokens)","user query or instruction"],"output_types":["contextually relevant response","citations or references to source context (if prompted)"],"categories":["text-generation-language","context-management"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-qwen--qwen2.5-3b-instruct__cap_6","uri":"capability://code.generation.editing.code.aware.text.generation.with.programming.language.understanding","name":"code-aware text generation with programming language understanding","description":"Generates syntactically correct code across multiple programming languages (Python, JavaScript, Java, C++, SQL, etc.) through instruction-tuning on code datasets and code-specific training objectives. The model learns language-specific syntax, idioms, and common patterns, enabling it to complete code snippets, generate functions, and explain code without requiring external linters or syntax validators.","intents":["Generate Python functions or scripts from natural language descriptions","Complete code snippets in multiple languages with correct syntax","Explain existing code or translate code between languages","Generate SQL queries from natural language descriptions"],"best_for":["Code completion and generation tools","Developer assistants and pair programming applications","Educational tools for learning programming","Rapid prototyping and scaffolding"],"limitations":["Code generation quality varies by language — Python and JavaScript are better supported than niche languages","Generated code may have logical errors or inefficiencies despite syntactic correctness","No native understanding of project structure or dependencies — cannot reference external libraries without explicit context","Hallucination risk is higher for code than text — model may invent function names or library APIs that don't exist","No built-in testing or validation — generated code requires manual review and testing"],"requires":["Python 3.8+","Transformers library","Optional: linting tools (pylint, eslint) for post-generation validation"],"input_types":["natural language description of desired code","code snippet to complete or refactor","programming language specification","optional context (imports, function signatures, etc.)"],"output_types":["code snippet or function","complete script or module","code explanation or documentation"],"categories":["code-generation-editing","programming-assistance"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-qwen--qwen2.5-3b-instruct__cap_7","uri":"capability://text.generation.language.few.shot.learning.via.in.context.examples","name":"few-shot learning via in-context examples","description":"Learns new tasks from a small number of examples provided in the prompt (few-shot learning) without fine-tuning, using the model's learned ability to recognize patterns and generalize from examples. By including 1-5 examples of input-output pairs in the prompt, developers can guide the model to perform new tasks (e.g., sentiment classification, entity extraction, format conversion) without retraining.","intents":["Perform sentiment classification by providing 2-3 labeled examples in the prompt","Extract structured data (entities, relationships) from text using example-based patterns","Convert text between formats (e.g., CSV to JSON) by showing one or two examples","Implement custom classification or extraction tasks without fine-tuning"],"best_for":["Rapid prototyping of NLP tasks without labeled training data","One-off or low-volume tasks where fine-tuning ROI is low","Applications requiring dynamic task switching without model reloading","Researchers exploring model capabilities without training infrastructure"],"limitations":["Few-shot performance is highly sensitive to example quality and ordering — poor examples degrade accuracy significantly","Performance plateaus with 5-10 examples — adding more examples doesn't improve accuracy and wastes context","No guarantee of consistency — model may produce different outputs for semantically identical inputs","Requires manual example curation — no automated way to select optimal examples","Context window is shared between examples and actual task — long examples reduce space for input data"],"requires":["Understanding of prompt engineering and example selection","Ability to format examples clearly and consistently","No additional training data or infrastructure"],"input_types":["prompt with 1-5 examples (input-output pairs)","new input to apply the learned pattern to"],"output_types":["output following the pattern demonstrated by examples","structured data if examples show structured format"],"categories":["text-generation-language","prompt-engineering"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-qwen--qwen2.5-3b-instruct__cap_8","uri":"capability://automation.workflow.batch.inference.with.dynamic.batching.for.throughput.optimization","name":"batch inference with dynamic batching for throughput optimization","description":"Processes multiple requests simultaneously through dynamic batching, where requests of different lengths are grouped together and padded to the same length for efficient GPU utilization. The inference engine (vLLM, text-generation-webserver) schedules requests to maximize GPU occupancy while respecting latency constraints, enabling high throughput on shared hardware without sacrificing per-request latency.","intents":["Process hundreds of inference requests per second on a single GPU","Maximize GPU utilization by batching requests of varying lengths","Serve multiple users concurrently without significant latency increase","Reduce per-request inference cost in production by amortizing GPU overhead"],"best_for":["Production API servers handling multiple concurrent requests","Batch processing pipelines (e.g., classifying thousands of documents)","Multi-tenant systems with variable request patterns","Cost-sensitive deployments where throughput matters more than latency"],"limitations":["Dynamic batching adds 10-50ms latency due to scheduling overhead and request queueing","Batching is ineffective for streaming responses — streaming requests cannot be batched","Memory usage scales with batch size — larger batches require more VRAM","Padding tokens in shorter sequences waste computation — effective throughput depends on length distribution","No native support for priority queuing — all requests are treated equally"],"requires":["Inference framework supporting dynamic batching (vLLM, text-generation-webserver, TensorRT-LLM)","GPU with sufficient VRAM for batch size (8-16GB for batch size 32-64)","Load balancer or request queue for managing incoming requests"],"input_types":["multiple prompts (array of strings)","batch size configuration","optional: request priorities or deadlines"],"output_types":["array of generated responses","per-request metadata (tokens generated, latency, etc.)"],"categories":["automation-workflow","inference-optimization"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-qwen--qwen2.5-3b-instruct__cap_9","uri":"capability://safety.moderation.safety.aligned.response.generation.with.refusal.capabilities","name":"safety-aligned response generation with refusal capabilities","description":"Generates responses that align with safety guidelines through instruction-tuning on safety-focused datasets, including the ability to recognize and refuse harmful requests (e.g., illegal activities, violence, abuse). The model learns to identify unsafe requests and respond with explanations of why it cannot fulfill them, without requiring external content filters or guardrails.","intents":["Deploy a chatbot that refuses harmful requests without external moderation","Reduce moderation costs by having the model self-filter unsafe content","Ensure compliance with safety policies without manual review","Provide transparent refusals that explain why a request cannot be fulfilled"],"best_for":["Public-facing chatbots requiring safety alignment","Applications subject to content moderation regulations","Teams without dedicated moderation infrastructure","Prototyping safe AI systems before deploying to production"],"limitations":["Safety alignment is not foolproof — adversarial prompts can sometimes bypass refusals (jailbreaking)","Refusal behavior is learned from training data — edge cases may not be handled correctly","Over-refusal risk — model may refuse legitimate requests if they're phrased ambiguously","No transparency into safety decision-making — difficult to debug why specific requests are refused","Safety alignment may reduce helpfulness on borderline requests (e.g., discussing sensitive topics for educational purposes)"],"requires":["No additional dependencies — safety alignment is built into the model","Optional: external content filters for additional safety layers"],"input_types":["user prompt (any content)","optional: safety policy specification"],"output_types":["safe response or refusal explanation","optional: confidence score for safety decision"],"categories":["safety-moderation","content-filtering"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-qwen--qwen2.5-3b-instruct__headline","uri":"capability://text.generation.language.ai.text.generation.model.for.chatbots.and.assistants","name":"ai text generation model for chatbots and assistants","description":"Qwen2.5-3B-Instruct is an advanced AI text generation model designed specifically for creating conversational agents and chatbots, enabling natural interactions in various languages.","intents":["best AI text generation model","text generation for chatbots","top conversational AI models","AI assistant for natural language processing","best models for chatbot development"],"best_for":["chatbots","conversational applications"],"limitations":[],"requires":[],"input_types":["text"],"output_types":["text"],"categories":["text-generation-language"],"confidence":0.5,"matches":0,"success_rate":0}],"trust":{"score":54,"verified":false,"data_access_risk":"low","permissions":["Python 3.8+","PyTorch 2.0+ or compatible inference engine (vLLM, Ollama, llama.cpp)","4-6GB VRAM for fp16 inference, 2-3GB for 8-bit quantization","HuggingFace transformers library (version 4.36+)","Optional: CUDA 11.8+ for GPU acceleration, or CPU-only mode (slower)","bitsandbytes library (for 8-bit/4-bit quantization) or GPTQ (for pre-quantized weights)","safetensors Python library (0.3.1+)","PyTorch with quantization support (2.0+)","For 4-bit: CUDA 11.8+ or CPU mode (very slow)","PyTorch with CPU support (no CUDA required)"],"failure_modes":["Context window limited to 32,768 tokens — cannot process documents longer than ~25,000 words without truncation","Knowledge cutoff at training time (April 2024) — no real-time information or web awareness","Instruction-following quality degrades on highly specialized domains (medical, legal, scientific) compared to 70B+ models","No native tool-calling or function-invocation support — requires prompt engineering or external orchestration","Quantization to 4-bit or 8-bit reduces quality by ~5-10% on reasoning tasks","4-bit quantization introduces ~3-8% accuracy degradation on factual recall and mathematical reasoning","Safetensors loading adds ~100-200ms overhead on first load (cached after initial conversion)","Quantization requires compatible inference engines — not all frameworks support all precision formats","Dynamic quantization at runtime adds ~50-100ms latency per inference call","No native support for mixed-precision inference (e.g., fp16 for attention, int8 for FFN layers)","builder identity is not verified yet","no observed match outcomes yet"],"rank_breakdown":{"adoption":0.8919784016043621,"quality":0.32,"ecosystem":0.5000000000000001,"match_graph":0.25,"freshness":0.75,"weights":{"adoption":0.35,"quality":0.2,"ecosystem":0.1,"match_graph":0.3,"freshness":0.05}},"observed_outcomes":{"matches":0,"success_rate":0,"avg_confidence":0,"top_intents":[],"last_matched_at":null},"maintenance":{"status":"active","updated_at":"2026-05-24T12:16:22.765Z","last_scraped_at":"2026-05-03T14:22:48.039Z","last_commit":null},"community":{"stars":null,"forks":null,"weekly_downloads":null,"model_downloads":9207977,"model_likes":450}},"distribution":{"claim_url":"https://unfragile.ai/submit?claim=qwen--qwen2.5-3b-instruct","compare_url":"https://unfragile.ai/compare?artifact=qwen--qwen2.5-3b-instruct"}},"signature":"0BSYwHqHFww+DurLsViKEc6ONf8yjGxxVYGAlqnQnjjd6haZ5YuQBa0slYC2SNq9CgX7q2kdscAUAjXqSpUaAA==","signedAt":"2026-06-21T08:52:52.097Z","signedBy":"unfragile.ai","version":1},"_links":{"self":"https://unfragile.ai/api/v1/passport/qwen--qwen2.5-3b-instruct","artifact":"https://unfragile.ai/qwen--qwen2.5-3b-instruct","verify":"https://unfragile.ai/api/v1/verify?slug=qwen--qwen2.5-3b-instruct","publicKey":"https://unfragile.ai/api/v1/trust-passport-public-key","spec":"https://unfragile.ai/trust","schema":"https://unfragile.ai/schema.json","docs":"https://unfragile.ai/docs"}}