{"passport":{"unfragile":{"@version":"1.0","version":"2026-05","artifact":{"id":"hf-model-tomaarsen--span-marker-mbert-base-multinerd","slug":"tomaarsen--span-marker-mbert-base-multinerd","name":"span-marker-mbert-base-multinerd","type":"model","url":"https://huggingface.co/tomaarsen/span-marker-mbert-base-multinerd","page_url":"https://unfragile.ai/tomaarsen--span-marker-mbert-base-multinerd","categories":["model-training"],"tags":["span-marker","pytorch","tensorboard","safetensors","token-classification","ner","named-entity-recognition","multilingual","dataset:Babelscape/multinerd","base_model:google-bert/bert-base-multilingual-cased","base_model:finetune:google-bert/bert-base-multilingual-cased","license:cc-by-nc-sa-4.0","model-index","region:us"],"pricing":{"model":"open_source","free":true,"starting_price":null},"status":"active","verified":false},"capabilities":[{"id":"hf-model-tomaarsen--span-marker-mbert-base-multinerd__cap_0","uri":"capability://data.processing.analysis.multilingual.named.entity.recognition.with.span.based.token.classification","name":"multilingual named entity recognition with span-based token classification","description":"Performs token-level classification using a span-marker architecture built on mBERT (multilingual BERT), enabling detection and classification of named entities across 10+ languages simultaneously. The model uses a two-stage span-based approach: first identifying entity boundaries via token classification, then assigning entity type labels to detected spans. This differs from traditional sequence labeling by operating on variable-length spans rather than individual tokens, reducing cascading errors from boundary misalignment.","intents":["extract person names, organizations, locations, and other entity types from multilingual text documents","build NER pipelines that work across multiple languages without language-specific retraining","identify fine-grained entity categories (e.g., distinguishing between person names and product names) in unstructured text","process documents containing mixed-language content with a single model"],"best_for":["NLP teams building multilingual information extraction systems","developers creating document processing pipelines for international content","researchers working with low-resource languages covered by mBERT","organizations needing entity recognition without language-specific model management"],"limitations":["Trained only on MultiNERD dataset — may not recognize domain-specific entities (medical, legal, financial terminology) outside training distribution","mBERT base model has 110M parameters, requiring ~500MB GPU memory; slower inference than distilled alternatives (50-100ms per document on CPU)","Span-marker approach assumes entities are contiguous sequences; cannot handle discontinuous or overlapping entity mentions","Performance degrades on languages with limited mBERT pretraining data (e.g., low-resource African languages); best performance on high-resource languages (English, Chinese, Spanish, German)"],"requires":["PyTorch 1.9+ or TensorFlow 2.4+","Transformers library 4.25+","Python 3.7+","~1GB disk space for model weights (safetensors format)","GPU with 2GB+ VRAM recommended for batch inference; CPU inference supported but slower"],"input_types":["raw text strings","tokenized sequences (pre-tokenized input)","documents up to 512 tokens (mBERT context window)"],"output_types":["token-level classification labels (IOB2 or BIOES format)","span boundaries with entity type labels","confidence scores per entity","structured JSON with entity offsets and types"],"categories":["data-processing-analysis","token-classification"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-tomaarsen--span-marker-mbert-base-multinerd__cap_1","uri":"capability://data.processing.analysis.cross.lingual.entity.type.classification.with.shared.embedding.space","name":"cross-lingual entity type classification with shared embedding space","description":"Leverages mBERT's multilingual embedding space to classify entity types consistently across languages without language-specific fine-tuning. The model encodes text through mBERT's 12 transformer layers, projecting tokens into a shared 768-dimensional space where entity semantics align across languages. This enables zero-shot or few-shot entity classification for languages not explicitly seen during training, as long as they're covered by mBERT's 104-language pretraining.","intents":["classify entities in languages not explicitly in the training set using cross-lingual transfer","reduce annotation burden by training on high-resource languages and applying to low-resource languages","build entity classification systems that generalize across language families (e.g., Romance, Germanic, Slavic)"],"best_for":["multilingual NLP teams with limited annotation budgets for low-resource languages","organizations processing documents in 50+ languages with a single model","researchers studying cross-lingual transfer learning in NER tasks"],"limitations":["Cross-lingual transfer quality depends on mBERT's pretraining coverage; languages with minimal Wikipedia representation (e.g., minority languages) see 10-20% accuracy drops","Entity types must be semantically similar across languages; culturally-specific entity categories may not transfer well","No explicit language identification — model assumes input is valid text in a supported language"],"requires":["mBERT tokenizer (included in transformers library)","Understanding of target language's script and tokenization conventions","Multilingual training data or validation set to measure cross-lingual performance"],"input_types":["text in any of 104 languages supported by mBERT","mixed-language documents (model processes each token independently)"],"output_types":["entity type labels consistent across languages","confidence scores reflecting cross-lingual alignment"],"categories":["data-processing-analysis","memory-knowledge"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-tomaarsen--span-marker-mbert-base-multinerd__cap_2","uri":"capability://data.processing.analysis.fine.grained.entity.type.disambiguation.with.10.entity.categories","name":"fine-grained entity type disambiguation with 10+ entity categories","description":"Classifies detected entities into 10+ distinct entity types (person, organization, location, product, event, etc.) as defined by the MultiNERD dataset, enabling fine-grained information extraction beyond simple binary entity/non-entity classification. The model learns type-specific patterns through supervised training on MultiNERD's annotated corpus, using mBERT's contextual representations to disambiguate entities with identical surface forms but different types (e.g., 'Apple' as company vs. fruit).","intents":["extract and categorize entities into specific types for downstream knowledge graph construction","disambiguate entities with multiple possible types based on context","build entity-centric search and recommendation systems with type-aware filtering"],"best_for":["information extraction pipelines requiring structured entity type labels","knowledge graph construction systems needing entity type classification","semantic search systems that filter by entity type"],"limitations":["Entity types are fixed to MultiNERD's taxonomy (10+ types); custom entity types require model retraining","Accuracy varies by entity type; rare types (e.g., 'event', 'product') have lower F1 scores (~70-75%) compared to common types like 'person' (~90%)","Requires sufficient context (typically 3-5 surrounding tokens) for accurate type disambiguation; isolated entity mentions may be misclassified"],"requires":["MultiNERD entity type taxonomy or custom mapping to model's output classes","Sufficient context around entities (model uses 512-token window)"],"input_types":["text with entity mentions in context"],"output_types":["entity type labels from MultiNERD taxonomy","confidence scores per entity type","structured output with entity text, type, and span offsets"],"categories":["data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-tomaarsen--span-marker-mbert-base-multinerd__cap_3","uri":"capability://data.processing.analysis.batch.entity.extraction.with.efficient.span.enumeration","name":"batch entity extraction with efficient span enumeration","description":"Processes multiple documents or long documents through efficient span enumeration, where the model identifies all possible entity spans (up to a configurable maximum length, typically 8-10 tokens) and classifies each span's entity type. This approach avoids redundant token-level computations by leveraging mBERT's contextual representations across the entire document, then scoring spans post-hoc. Batch processing is optimized through padding and masking to handle variable-length inputs efficiently.","intents":["extract entities from large document collections with minimal latency overhead","process documents longer than 512 tokens by sliding window or chunking strategies","optimize inference throughput for production NER pipelines handling thousands of documents daily"],"best_for":["production NER systems processing high-volume document streams","batch processing pipelines for historical document archives","real-time entity extraction services with latency constraints (<100ms per document)"],"limitations":["Span enumeration has quadratic complexity in document length; documents >512 tokens require chunking or sliding window, introducing boundary artifacts","Maximum span length is fixed (typically 8-10 tokens); entities longer than this are missed or fragmented","Batch processing requires padding to common length, wasting computation on short documents; dynamic batching adds complexity"],"requires":["Batch processing framework (PyTorch DataLoader, TensorFlow Dataset, or equivalent)","GPU or multi-core CPU for parallel inference","Memory for storing intermediate span representations (~100MB for 1000-document batches)"],"input_types":["lists of text documents","pre-tokenized document batches"],"output_types":["batch entity predictions with document-level offsets","confidence scores per entity","structured JSON with document ID, entity text, type, and span offsets"],"categories":["data-processing-analysis","automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-tomaarsen--span-marker-mbert-base-multinerd__cap_4","uri":"capability://memory.knowledge.contextual.entity.representation.extraction.for.downstream.tasks","name":"contextual entity representation extraction for downstream tasks","description":"Exposes mBERT's intermediate layer representations (768-dimensional contextual embeddings) for each detected entity span, enabling downstream tasks like entity linking, coreference resolution, or entity similarity matching. The model outputs not just entity type labels but also the pooled contextual representation of each entity span, computed by averaging mBERT's hidden states across the span's tokens. These representations capture semantic and syntactic context, enabling vector-based entity operations.","intents":["extract entity embeddings for entity linking to knowledge bases (e.g., Wikidata)","compute entity similarity for coreference resolution or duplicate detection","build entity-aware semantic search by indexing entity embeddings in vector databases"],"best_for":["knowledge graph construction pipelines requiring entity linking","coreference resolution systems using entity embeddings","entity-centric semantic search and recommendation systems"],"limitations":["Entity embeddings are context-dependent; same entity mention in different contexts produces different embeddings, requiring careful handling in entity linking","768-dimensional embeddings require vector database infrastructure (e.g., Faiss, Pinecone) for efficient similarity search at scale","Embedding quality depends on mBERT's pretraining; domain-specific entities may have poor representations"],"requires":["Vector database or similarity search library (Faiss, Annoy, Pinecone, etc.)","Entity linking knowledge base (e.g., Wikidata, DBpedia) with embeddings for linking","Storage for 768-dimensional vectors (~3KB per entity)"],"input_types":["text with entity mentions"],"output_types":["768-dimensional entity embeddings (float32)","entity type labels","span offsets and confidence scores"],"categories":["memory-knowledge","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-tomaarsen--span-marker-mbert-base-multinerd__cap_5","uri":"capability://automation.workflow.safetensors.model.serialization.for.secure.and.efficient.model.loading","name":"safetensors model serialization for secure and efficient model loading","description":"Uses safetensors format for model weights instead of traditional PyTorch pickle format, enabling faster model loading, reduced memory overhead, and protection against arbitrary code execution during deserialization. Safetensors is a binary format that stores tensor data with explicit type and shape information, allowing zero-copy memory mapping on compatible systems. The model is distributed as a single safetensors file, eliminating the need for separate config and weight files.","intents":["load model weights quickly in production environments with minimal startup latency","safely load model weights without risk of arbitrary code execution from untrusted sources","reduce model storage footprint and download bandwidth through efficient binary serialization"],"best_for":["production systems requiring fast model initialization (<1 second)","containerized deployments with strict security requirements","edge devices with limited storage and bandwidth"],"limitations":["Safetensors support requires transformers library 4.25+; older versions require conversion to PyTorch format","Memory mapping benefits only apply on systems with sufficient virtual memory; embedded systems may not benefit","Safetensors format is immutable; model quantization or pruning requires conversion back to PyTorch format"],"requires":["transformers library 4.25+","safetensors library (installed automatically with transformers)","~500MB disk space for model weights"],"input_types":["safetensors binary files"],"output_types":["loaded PyTorch model state dict","model ready for inference"],"categories":["automation-workflow","safety-moderation"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"hf-model-tomaarsen--span-marker-mbert-base-multinerd__cap_6","uri":"capability://data.processing.analysis.multilingual.tokenization.with.mbert.s.shared.vocabulary","name":"multilingual tokenization with mbert's shared vocabulary","description":"Leverages mBERT's 119K shared vocabulary across 104 languages, enabling consistent tokenization of multilingual text without language-specific tokenizers. The WordPiece tokenizer handles subword segmentation for out-of-vocabulary words, preserving morphological information across languages. This unified tokenization approach ensures that entities in different languages are represented in a shared token space, enabling the span-marker model to apply consistent entity classification rules across languages.","intents":["tokenize multilingual documents with a single tokenizer without language detection or switching","handle code-mixed text (e.g., English-Spanish) with consistent subword segmentation","ensure entity boundaries align with tokenization across languages"],"best_for":["multilingual NLP pipelines avoiding language-specific tokenizer management","systems processing code-mixed or transliterated text","organizations standardizing on a single tokenization scheme across languages"],"limitations":["Shared vocabulary may be suboptimal for individual languages; language-specific tokenizers (e.g., SentencePiece for CJK languages) often achieve better compression","Tokenization quality degrades for languages with limited mBERT pretraining data; rare scripts may produce excessive subword fragmentation","119K vocabulary size limits expressiveness compared to larger models (e.g., GPT-3's 50K tokens); rare words are heavily subword-segmented"],"requires":["mBERT tokenizer (included in transformers library)","Input text in valid Unicode encoding (UTF-8 recommended)"],"input_types":["raw text in any of 104 supported languages","code-mixed text combining multiple languages"],"output_types":["token IDs (integers 0-119K)","token strings","attention masks and token type IDs for model input"],"categories":["data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0}],"trust":{"score":45,"verified":false,"data_access_risk":"high","permissions":["PyTorch 1.9+ or TensorFlow 2.4+","Transformers library 4.25+","Python 3.7+","~1GB disk space for model weights (safetensors format)","GPU with 2GB+ VRAM recommended for batch inference; CPU inference supported but slower","mBERT tokenizer (included in transformers library)","Understanding of target language's script and tokenization conventions","Multilingual training data or validation set to measure cross-lingual performance","MultiNERD entity type taxonomy or custom mapping to model's output classes","Sufficient context around entities (model uses 512-token window)"],"failure_modes":["Trained only on MultiNERD dataset — may not recognize domain-specific entities (medical, legal, financial terminology) outside training distribution","mBERT base model has 110M parameters, requiring ~500MB GPU memory; slower inference than distilled alternatives (50-100ms per document on CPU)","Span-marker approach assumes entities are contiguous sequences; cannot handle discontinuous or overlapping entity mentions","Performance degrades on languages with limited mBERT pretraining data (e.g., low-resource African languages); best performance on high-resource languages (English, Chinese, Spanish, German)","Cross-lingual transfer quality depends on mBERT's pretraining coverage; languages with minimal Wikipedia representation (e.g., minority languages) see 10-20% accuracy drops","Entity types must be semantically similar across languages; culturally-specific entity categories may not transfer well","No explicit language identification — model assumes input is valid text in a supported language","Entity types are fixed to MultiNERD's taxonomy (10+ types); custom entity types require model retraining","Accuracy varies by entity type; rare types (e.g., 'event', 'product') have lower F1 scores (~70-75%) compared to common types like 'person' (~90%)","Requires sufficient context (typically 3-5 surrounding tokens) for accurate type disambiguation; isolated entity mentions may be misclassified","builder identity is not verified yet","no observed match outcomes yet"],"rank_breakdown":{"adoption":0.6010476020646116,"quality":0.39,"ecosystem":0.5000000000000001,"match_graph":0.25,"freshness":0.75,"weights":{"adoption":0.35,"quality":0.2,"ecosystem":0.1,"match_graph":0.3,"freshness":0.05}},"observed_outcomes":{"matches":0,"success_rate":0,"avg_confidence":0,"top_intents":[],"last_matched_at":null},"maintenance":{"status":"active","updated_at":"2026-05-24T12:16:22.766Z","last_scraped_at":"2026-05-03T14:23:01.785Z","last_commit":null},"community":{"stars":null,"forks":null,"weekly_downloads":null,"model_downloads":249148,"model_likes":71}},"distribution":{"claim_url":"https://unfragile.ai/submit?claim=tomaarsen--span-marker-mbert-base-multinerd","compare_url":"https://unfragile.ai/compare?artifact=tomaarsen--span-marker-mbert-base-multinerd"}},"signature":"RX4Ru4V7VlBuBlQ3aIQJhSIPyB59F5Ag9IEl3dfFXfbT6O+7bVxuga7qXnMmgxiNJwpPmZvj3x472vJE27TzCQ==","signedAt":"2026-06-21T00:56:07.285Z","signedBy":"unfragile.ai","version":1},"_links":{"self":"https://unfragile.ai/api/v1/passport/tomaarsen--span-marker-mbert-base-multinerd","artifact":"https://unfragile.ai/tomaarsen--span-marker-mbert-base-multinerd","verify":"https://unfragile.ai/api/v1/verify?slug=tomaarsen--span-marker-mbert-base-multinerd","publicKey":"https://unfragile.ai/api/v1/trust-passport-public-key","spec":"https://unfragile.ai/trust","schema":"https://unfragile.ai/schema.json","docs":"https://unfragile.ai/docs"}}