{"passport":{"unfragile":{"@version":"1.0","version":"2026-05","artifact":{"id":"pypi_pypi-gensim","slug":"pypi-gensim","name":"gensim","type":"repo","url":"https://radimrehurek.com/gensim/","page_url":"https://unfragile.ai/pypi-gensim","categories":["frameworks-sdks","rag-knowledge"],"tags":["Singular","Value","Decomposition","SVD","Latent","Semantic","Indexing","LSA","LSI","Latent","Dirichlet","Allocation","LDA","Hierarchical","Dirichlet","Process","HDP","Random","Projections","TFIDF"],"pricing":{"model":"open_source","free":true,"starting_price":null},"status":"active","verified":false},"capabilities":[{"id":"pypi_pypi-gensim__cap_0","uri":"capability://data.processing.analysis.latent.semantic.indexing.lsi.with.svd.decomposition","name":"latent semantic indexing (lsi) with svd decomposition","description":"Decomposes document-term matrices using Singular Value Decomposition to discover latent semantic relationships between documents and terms. Gensim implements sparse SVD via ARPACK, reducing dimensionality while preserving semantic structure, enabling semantic search and document similarity without explicit keyword matching. The implementation handles large sparse matrices efficiently through iterative algorithms rather than dense matrix operations.","intents":["I need to find semantically similar documents without exact keyword matches","I want to reduce noise in document collections by discovering underlying semantic topics","I need to perform semantic search across a corpus with automatic dimensionality reduction"],"best_for":["Information retrieval engineers building semantic search systems","NLP researchers exploring latent semantic analysis","Teams processing document collections with limited computational resources"],"limitations":["SVD computation scales O(n²) with vocabulary size; becomes slow beyond 100k+ unique terms","Requires dense matrix operations for final similarity computation despite sparse input","No incremental updates — must recompute entire decomposition when corpus changes","Semantic quality degrades with very short documents or sparse term distributions"],"requires":["Python 2.7+ or 3.5+","NumPy and SciPy for linear algebra operations","ARPACK library (typically bundled with SciPy)","Corpus with at least 100+ documents for meaningful semantic discovery"],"input_types":["document-term matrix (sparse or dense)","bag-of-words representation","TF-IDF weighted vectors"],"output_types":["low-rank semantic space representation","document-topic vectors","similarity scores between documents"],"categories":["data-processing-analysis","search-retrieval"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"pypi_pypi-gensim__cap_1","uri":"capability://data.processing.analysis.latent.dirichlet.allocation.lda.topic.modeling","name":"latent dirichlet allocation (lda) topic modeling","description":"Probabilistic generative model that discovers latent topics in document collections using variational inference or Gibbs sampling. Gensim implements online LDA with mini-batch updates, allowing incremental model training on streaming data without reprocessing the entire corpus. The model learns per-document topic distributions and per-topic word distributions through iterative Bayesian inference, enabling dynamic topic discovery as new documents arrive.","intents":["I need to automatically discover hidden topics in a large document collection","I want to update my topic model incrementally as new documents arrive without full retraining","I need to infer topic distributions for new unseen documents using a trained model"],"best_for":["Content teams analyzing document collections for thematic structure","Researchers in computational linguistics and NLP","Systems requiring incremental model updates with streaming document ingestion"],"limitations":["Requires manual tuning of number of topics — no automatic selection mechanism","Convergence is slow for large vocabularies (100k+ terms); typically requires 10-50 passes","Topic interpretability depends heavily on preprocessing quality; garbage input produces garbage topics","Gibbs sampling variant is single-threaded; variational inference is slower than modern neural alternatives","No built-in coherence evaluation — requires external metrics to assess topic quality"],"requires":["Python 2.7+ or 3.5+","NumPy for numerical operations","Corpus with minimum 50+ documents for stable topic discovery","Preprocessed text (tokenization, stopword removal, lowercasing)"],"input_types":["bag-of-words corpus","document-term matrix","token streams"],"output_types":["topic-word distributions (per-topic vocabulary)","document-topic distributions","topic assignments for individual tokens"],"categories":["data-processing-analysis","memory-knowledge"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"pypi_pypi-gensim__cap_10","uri":"capability://automation.workflow.model.persistence.and.serialization","name":"model persistence and serialization","description":"Provides serialization and deserialization of trained models (embeddings, topic models, transformations) to disk for reproducibility and production deployment. Gensim implements model saving through pickle and custom binary formats, enabling models to be trained once and reused across multiple applications without retraining. The serialization preserves all learned parameters and statistics, enabling deterministic inference on new data.","intents":["I need to save trained models and reuse them in production without retraining","I want to share trained models with team members or deploy them to different systems","I need to version control models and track which model version produced specific results"],"best_for":["Production NLP systems requiring model versioning and deployment","Teams sharing trained models across development and production environments","Researchers publishing reproducible results with trained model artifacts"],"limitations":["Pickle format is Python-specific; models cannot be loaded in other languages without custom deserialization","Model files can be large (100MB+ for large embeddings); no built-in compression","No versioning mechanism; breaking changes in Gensim versions can cause deserialization failures","Binary format is not human-readable; difficult to inspect or debug model contents","No incremental model updates; must reload entire model to add new data"],"requires":["Python 2.7+ or 3.5+","File system access for reading/writing model files","Same Gensim version for serialization and deserialization (or compatible versions)"],"input_types":["trained Gensim models"],"output_types":["serialized model files (pickle or binary format)","loaded model objects"],"categories":["automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"pypi_pypi-gensim__cap_11","uri":"capability://data.processing.analysis.corpus.statistics.and.vocabulary.analysis","name":"corpus statistics and vocabulary analysis","description":"Computes and tracks corpus-level statistics including document frequencies, term frequencies, vocabulary size, and term co-occurrence patterns. Gensim's Dictionary class maintains these statistics during corpus iteration, enabling analysis of vocabulary properties without materializing the full corpus. Statistics are used by downstream models (TF-IDF, LDA) to learn appropriate weighting and prior parameters.","intents":["I need to understand vocabulary properties and term distributions in my corpus","I want to filter rare or common terms based on frequency thresholds","I need to compute corpus statistics for model initialization and hyperparameter tuning"],"best_for":["NLP practitioners performing exploratory data analysis on text collections","Researchers studying vocabulary properties and term distributions","Teams tuning model hyperparameters based on corpus characteristics"],"limitations":["Statistics are computed during corpus iteration; no random access to per-document statistics","Frequency thresholds are global; no support for document-specific filtering","Co-occurrence statistics require explicit computation; not automatically tracked","Large vocabularies (1M+ terms) consume significant memory for frequency tracking","No built-in visualization tools; statistics must be exported for external analysis"],"requires":["Python 2.7+ or 3.5+","Corpus data source","Dictionary object"],"input_types":["tokenized corpus","bag-of-words corpus"],"output_types":["vocabulary statistics","term frequency distributions","document frequency counts"],"categories":["data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"pypi_pypi-gensim__cap_12","uri":"capability://data.processing.analysis.gensim.specific.corpus.format.support.mmcorpus.svmlightcorpus","name":"gensim-specific corpus format support (mmcorpus, svmlightcorpus)","description":"Provides native support for reading and writing corpus data in Gensim-optimized formats (Matrix Market, SVMLight) that enable efficient storage and retrieval of sparse document-term matrices. These formats store only non-zero entries, reducing disk space and I/O overhead compared to dense formats. Gensim's corpus readers integrate with the corpus abstraction, enabling seamless iteration over files in these formats.","intents":["I need to store large sparse document-term matrices efficiently on disk","I want to share corpus data in a standard format that other tools can read","I need to load pre-existing corpus files in Matrix Market or SVMLight format"],"best_for":["Teams managing large corpus files requiring efficient storage","Researchers sharing corpus data in standard formats","Systems requiring interoperability with SVMLight-based machine learning tools"],"limitations":["Matrix Market format is text-based; slower I/O than binary formats for large corpora","SVMLight format stores only feature-value pairs; metadata and document IDs must be managed separately","No compression support; files can be large despite sparse representation","Format conversion requires explicit corpus iteration; no in-place format conversion","Limited adoption outside Gensim ecosystem; fewer tools support these formats compared to CSV/JSON"],"requires":["Python 2.7+ or 3.5+","Corpus files in Matrix Market or SVMLight format"],"input_types":["Matrix Market files (.mm)","SVMLight files (.svmlight)"],"output_types":["Gensim corpus objects","sparse matrices"],"categories":["data-processing-analysis","automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"pypi_pypi-gensim__cap_13","uri":"capability://search.retrieval.similarity.indexing.and.approximate.nearest.neighbor.search","name":"similarity indexing and approximate nearest neighbor search","description":"Provides optional similarity indexing through sparse matrix structures and integration with approximate nearest neighbor libraries (Annoy, FAISS) for efficient similarity queries on large corpora. Gensim's SparseMatrixSimilarity class enables fast similarity computation through sparse matrix multiplication, while optional indexing backends enable sublinear-time nearest neighbor search. This enables semantic search and recommendation systems to scale to millions of documents.","intents":["I need to find the k most similar documents to a query efficiently on a large corpus","I want to scale semantic search to millions of documents without exhaustive comparison","I need to build recommendation systems that rank documents by semantic relevance"],"best_for":["Search and recommendation systems at scale (100k+ documents)","Teams building semantic similarity services with latency requirements","Researchers studying approximate nearest neighbor algorithms"],"limitations":["Sparse matrix indexing is still O(n) for exhaustive search; approximation requires external libraries","Approximate nearest neighbor search trades recall for speed; may miss relevant documents","Index construction is expensive; must rebuild when corpus changes significantly","Annoy/FAISS integration requires additional dependencies and custom implementation","No built-in index updates; incremental corpus changes require full index reconstruction"],"requires":["Python 2.7+ or 3.5+","NumPy and SciPy for sparse matrix operations","Optional: Annoy or FAISS library for approximate nearest neighbor search","Trained model producing vector representations"],"input_types":["document vectors","query vectors","sparse matrices"],"output_types":["k-nearest neighbor lists","ranked similarity scores","document indices"],"categories":["search-retrieval","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"pypi_pypi-gensim__cap_2","uri":"capability://data.processing.analysis.hierarchical.dirichlet.process.hdp.topic.modeling","name":"hierarchical dirichlet process (hdp) topic modeling","description":"Non-parametric Bayesian topic model that automatically infers the optimal number of topics without manual specification, using a hierarchical Dirichlet process prior. Gensim implements HDP via variational inference, discovering topic hierarchies and sharing statistical strength across topics through the DP structure. Unlike LDA, HDP can grow the topic space dynamically as evidence warrants, making it suitable for exploratory analysis where topic count is unknown.","intents":["I need to discover topics without knowing the optimal number in advance","I want automatic topic count selection based on data rather than manual tuning","I need a model that can discover rare topics without forcing them into a fixed topic budget"],"best_for":["Exploratory data analysis on document collections with unknown topic structure","Researchers studying topic hierarchies and topic sharing patterns","Systems where manual topic count tuning is infeasible or undesirable"],"limitations":["Significantly slower convergence than LDA due to DP sampling complexity","Inferred topic count can be unstable across runs with different random seeds","Requires more data than LDA to achieve stable topic discovery (typically 1000+ documents minimum)","Variational inference approximation can underestimate true number of topics","No incremental learning support — requires full model retraining for new documents"],"requires":["Python 2.7+ or 3.5+","NumPy and SciPy","Corpus with minimum 500+ documents for meaningful topic discovery","Significant computational resources (HDP is 5-10x slower than LDA)"],"input_types":["bag-of-words corpus","document-term matrix"],"output_types":["automatically-determined topic count","topic-word distributions","document-topic distributions"],"categories":["data-processing-analysis","planning-reasoning"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"pypi_pypi-gensim__cap_3","uri":"capability://data.processing.analysis.word2vec.distributed.word.embeddings.skip.gram.and.cbow","name":"word2vec distributed word embeddings (skip-gram and cbow)","description":"Learns dense vector representations of words by predicting context words (Skip-gram) or predicting target words from context (CBOW) using shallow neural networks. Gensim implements both architectures with negative sampling and hierarchical softmax for efficient training on large vocabularies. The model captures semantic and syntactic relationships in continuous vector space, enabling word analogy tasks and semantic similarity computation without explicit feature engineering.","intents":["I need semantic word representations for downstream NLP tasks like classification or clustering","I want to perform word analogy tasks (king - man + woman = queen) and semantic similarity queries","I need to initialize neural network embeddings with pre-trained semantic knowledge"],"best_for":["NLP practitioners building semantic similarity and analogy systems","Teams using embeddings as features for downstream machine learning models","Researchers studying word semantics and linguistic relationships"],"limitations":["Requires large corpus (100k+ tokens minimum) for meaningful embeddings; small corpora produce poor quality","No out-of-vocabulary (OOV) handling — unseen words have no representation unless using subword models","Training is single-threaded in pure Python; multi-threading has GIL contention overhead","Hyperparameter sensitivity: window size, negative samples, and learning rate significantly impact quality","No contextual embeddings — same word gets same vector regardless of context (unlike BERT/ELMo)"],"requires":["Python 2.7+ or 3.5+","NumPy for vector operations","Corpus with minimum 100k tokens for stable embeddings","Tokenized and preprocessed text"],"input_types":["tokenized sentences","raw text (with tokenization)","corpus of documents"],"output_types":["word vectors (dense embeddings)","similarity scores between words","analogy predictions"],"categories":["data-processing-analysis","memory-knowledge"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"pypi_pypi-gensim__cap_4","uri":"capability://data.processing.analysis.fasttext.subword.embeddings.with.character.n.grams","name":"fasttext subword embeddings with character n-grams","description":"Extends Word2Vec by representing words as bags of character n-grams, enabling embeddings for out-of-vocabulary (OOV) words and capturing morphological information. Gensim wraps the fastText algorithm, allowing words to be decomposed into subword units (e.g., 'running' = 'run' + 'nin' + 'ing' + special tokens), so unseen words get representations based on their character composition. This approach handles rare words, misspellings, and morphologically rich languages better than standard Word2Vec.","intents":["I need embeddings for rare or misspelled words that don't appear in training data","I want to capture morphological structure in word representations for inflected languages","I need robust word vectors for domains with high OOV rates (medical, technical terminology)"],"best_for":["NLP systems handling morphologically rich languages (Finnish, Turkish, German)","Applications with high out-of-vocabulary rates (medical, legal, technical domains)","Teams needing robust embeddings for noisy or misspelled text"],"limitations":["Slower training than Word2Vec due to character n-gram computation overhead (2-3x slower)","Larger model size due to storing subword vectors in addition to word vectors","Character n-gram parameters (min_n, max_n) require tuning; poor choices degrade quality","Less interpretable than Word2Vec — subword vectors don't correspond to meaningful linguistic units","Gensim's fastText wrapper is slower than native fastText C++ implementation"],"requires":["Python 2.7+ or 3.5+","NumPy","Corpus with minimum 100k tokens","Tokenized text (character n-grams are computed automatically)"],"input_types":["tokenized sentences","raw text with tokenization","corpus of documents"],"output_types":["word vectors with subword information","OOV word vectors (computed from character n-grams)","similarity scores for rare/unseen words"],"categories":["data-processing-analysis","memory-knowledge"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"pypi_pypi-gensim__cap_5","uri":"capability://data.processing.analysis.doc2vec.document.embeddings.paragraph.vector","name":"doc2vec document embeddings (paragraph vector)","description":"Learns fixed-size vector representations for entire documents by extending Word2Vec with a document ID token that acts as a memory of document context. Gensim implements both Distributed Memory (DM) and Distributed Bag-of-Words (DBOW) variants, training document vectors alongside word vectors through the same neural network objective. This enables semantic similarity between documents and document classification without explicit feature engineering.","intents":["I need fixed-size semantic representations for entire documents for clustering or classification","I want to find semantically similar documents without keyword matching","I need to initialize document embeddings for downstream supervised learning tasks"],"best_for":["Document clustering and similarity systems","Teams building document-level semantic search","Researchers studying document-level semantic representations"],"limitations":["Requires inference step for new documents (slower than pre-computed embeddings)","Training is sensitive to document length — very short or very long documents produce poor embeddings","No standard way to combine document and word vectors for hybrid representations","Inference requires multiple passes over document text; no single-pass encoding like BERT","Document vectors are not comparable across different model training runs without alignment"],"requires":["Python 2.7+ or 3.5+","NumPy","Corpus with minimum 100+ documents","Tokenized documents"],"input_types":["tokenized documents","document collections","raw text with tokenization"],"output_types":["document vectors (fixed-size embeddings)","document similarity scores","document-document distance matrices"],"categories":["data-processing-analysis","memory-knowledge"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"pypi_pypi-gensim__cap_6","uri":"capability://data.processing.analysis.tf.idf.vectorization.with.corpus.statistics","name":"tf-idf vectorization with corpus statistics","description":"Computes TF-IDF (Term Frequency-Inverse Document Frequency) weights for documents using corpus-wide statistics to identify important terms. Gensim implements TF-IDF as a transformation that learns IDF weights from a training corpus and applies them to new documents, supporting both standard TF-IDF and sublinear TF scaling. The implementation integrates with Gensim's corpus abstraction, enabling memory-efficient processing of large document collections.","intents":["I need to weight terms by importance, emphasizing rare discriminative terms over common ones","I want to convert raw bag-of-words representations into TF-IDF weighted vectors","I need to apply learned IDF statistics from a training corpus to new documents"],"best_for":["Information retrieval systems requiring term weighting","Teams building text classification pipelines with TF-IDF features","Researchers using TF-IDF as a baseline for semantic tasks"],"limitations":["TF-IDF is non-semantic — weights terms by frequency, not meaning; 'bank' gets same weight in financial vs. river contexts","Requires explicit IDF computation from training corpus; no transfer learning across domains","Sparse output requires sparse matrix support for memory efficiency","No built-in handling of synonyms or semantic relationships","Performance degrades with very large vocabularies (1M+ terms) due to sparse matrix operations"],"requires":["Python 2.7+ or 3.5+","NumPy and SciPy for sparse matrix operations","Training corpus to compute IDF statistics"],"input_types":["bag-of-words corpus","document-term matrices"],"output_types":["TF-IDF weighted vectors (sparse)","IDF statistics (learned model)"],"categories":["data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"pypi_pypi-gensim__cap_7","uri":"capability://data.processing.analysis.dictionary.and.corpus.abstraction.for.memory.efficient.processing","name":"dictionary and corpus abstraction for memory-efficient processing","description":"Provides abstract corpus and dictionary interfaces that enable memory-efficient processing of document collections larger than RAM through lazy iteration and streaming. The Dictionary maps tokens to integer IDs and tracks corpus statistics, while the corpus abstraction allows documents to be processed one-at-a-time without loading the entire collection into memory. This architecture enables all Gensim models to work with arbitrarily large corpora by iterating through documents on-demand.","intents":["I need to process document collections larger than available RAM","I want to apply multiple transformations (tokenization, TF-IDF, LSI) in a pipeline without materializing intermediate results","I need to efficiently manage vocabulary and token-to-ID mappings across large corpora"],"best_for":["Teams processing multi-gigabyte document collections on memory-constrained systems","Researchers building NLP pipelines with multiple sequential transformations","Systems requiring incremental corpus updates without full reprocessing"],"limitations":["Iteration-based design prevents random access to documents; no indexing by document ID","Multiple passes over corpus require re-reading from disk; no caching between iterations","Dictionary updates are not thread-safe; concurrent corpus access requires external synchronization","Sparse matrix operations still require materializing individual documents in memory","No built-in compression; large dictionaries consume significant memory for token-to-ID mappings"],"requires":["Python 2.7+ or 3.5+","Corpus data source (files, database, API)","Custom corpus class implementation for non-standard data sources"],"input_types":["document iterators","file paths","database connections","streaming data sources"],"output_types":["bag-of-words corpus objects","dictionary objects (token-to-ID mappings)","corpus statistics"],"categories":["data-processing-analysis","automation-workflow"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"pypi_pypi-gensim__cap_8","uri":"capability://search.retrieval.semantic.similarity.and.distance.computation","name":"semantic similarity and distance computation","description":"Computes semantic similarity between documents, words, or queries using learned representations (embeddings, topic distributions, or TF-IDF vectors). Gensim provides similarity interfaces that support multiple distance metrics (cosine, Euclidean, Jaccard) and enable efficient similarity queries through sparse matrix operations and optional indexing. The abstraction works with any vector representation, enabling similarity computation across different model types.","intents":["I need to find the most similar documents to a query without exhaustive search","I want to compute pairwise similarity between all documents in a collection","I need to rank documents by semantic relevance to a query"],"best_for":["Search and recommendation systems requiring semantic ranking","Document clustering and deduplication pipelines","Researchers studying semantic similarity metrics"],"limitations":["Exhaustive similarity computation is O(n*m) where n is corpus size and m is query count; no sublinear approximations","Sparse similarity matrices can be memory-intensive for large corpora (millions of documents)","Similarity metrics are symmetric; no support for asymmetric relevance (query-to-document vs. document-to-query)","No built-in approximate nearest neighbor search; requires external libraries (Annoy, FAISS) for scaling","Similarity quality depends entirely on quality of underlying representations"],"requires":["Python 2.7+ or 3.5+","NumPy and SciPy for sparse matrix operations","Trained model (embeddings, topic model, or TF-IDF) producing vector representations"],"input_types":["document vectors","query vectors","word vectors","sparse matrices"],"output_types":["similarity scores (0-1 range)","ranked document lists","similarity matrices"],"categories":["search-retrieval","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0},{"id":"pypi_pypi-gensim__cap_9","uri":"capability://automation.workflow.corpus.transformation.pipeline.composition","name":"corpus transformation pipeline composition","description":"Enables chaining multiple transformations (TF-IDF, LSI, LDA, normalization) into sequential pipelines that process documents through multiple stages. Gensim implements transformations as objects that learn statistics from training data and apply transformations to new documents, supporting composition through the corpus iteration interface. This enables building complex NLP pipelines (e.g., tokenize → TF-IDF → LSI → similarity) without materializing intermediate representations.","intents":["I need to apply multiple sequential transformations to documents in a single pipeline","I want to learn transformation parameters from training data and apply them consistently to new documents","I need to compose complex NLP workflows without materializing intermediate results"],"best_for":["NLP engineers building production text processing pipelines","Researchers experimenting with transformation combinations","Teams requiring reproducible, composable text processing workflows"],"limitations":["Pipeline composition is sequential only; no support for branching or conditional transformations","Transformation parameters are not automatically tuned; manual hyperparameter selection required","No built-in validation or cross-validation for pipeline evaluation","Debugging pipeline failures is difficult; errors propagate through multiple transformation stages","No automatic caching of intermediate results; repeated pipeline runs recompute all stages"],"requires":["Python 2.7+ or 3.5+","Multiple Gensim transformation objects (TfidfModel, LsiModel, etc.)","Training corpus for learning transformation parameters"],"input_types":["bag-of-words corpus","document-term matrices"],"output_types":["transformed corpus objects","pipeline objects (serializable)"],"categories":["automation-workflow","data-processing-analysis"],"confidence":0.5,"matches":0,"success_rate":0}],"trust":{"score":29,"verified":false,"data_access_risk":"high","permissions":["Python 2.7+ or 3.5+","NumPy and SciPy for linear algebra operations","ARPACK library (typically bundled with SciPy)","Corpus with at least 100+ documents for meaningful semantic discovery","NumPy for numerical operations","Corpus with minimum 50+ documents for stable topic discovery","Preprocessed text (tokenization, stopword removal, lowercasing)","File system access for reading/writing model files","Same Gensim version for serialization and deserialization (or compatible versions)","Corpus data source"],"failure_modes":["SVD computation scales O(n²) with vocabulary size; becomes slow beyond 100k+ unique terms","Requires dense matrix operations for final similarity computation despite sparse input","No incremental updates — must recompute entire decomposition when corpus changes","Semantic quality degrades with very short documents or sparse term distributions","Requires manual tuning of number of topics — no automatic selection mechanism","Convergence is slow for large vocabularies (100k+ terms); typically requires 10-50 passes","Topic interpretability depends heavily on preprocessing quality; garbage input produces garbage topics","Gibbs sampling variant is single-threaded; variational inference is slower than modern neural alternatives","No built-in coherence evaluation — requires external metrics to assess topic quality","Pickle format is Python-specific; models cannot be loaded in other languages without custom deserialization","builder identity is not verified yet","no observed match outcomes yet"],"rank_breakdown":{"adoption":0.05,"quality":0.35,"ecosystem":0.7000000000000001,"match_graph":0.25,"freshness":0.52,"weights":{"adoption":0.3,"quality":0.2,"ecosystem":0.15,"match_graph":0.3,"freshness":0.05}},"observed_outcomes":{"matches":0,"success_rate":0,"avg_confidence":0,"top_intents":[],"last_matched_at":null},"maintenance":{"status":"active","updated_at":"2026-05-24T12:16:25.060Z","last_scraped_at":"2026-05-03T15:20:25.058Z","last_commit":null},"community":{"stars":null,"forks":null,"weekly_downloads":null,"model_downloads":null,"model_likes":null}},"distribution":{"claim_url":"https://unfragile.ai/submit?claim=pypi-gensim","compare_url":"https://unfragile.ai/compare?artifact=pypi-gensim"}},"signature":"BRwrtQ1qzg3kfQvvDPXP5VleHTfrlEO2JvKpNoLt4h6nINUaJARnpC9FR/ndD9lNNaCCsmZOrtwH5Ha/+PlPDg==","signedAt":"2026-06-19T09:52:52.751Z","signedBy":"unfragile.ai","version":1},"_links":{"self":"https://unfragile.ai/api/v1/passport/pypi-gensim","artifact":"https://unfragile.ai/pypi-gensim","verify":"https://unfragile.ai/api/v1/verify?slug=pypi-gensim","publicKey":"https://unfragile.ai/api/v1/trust-passport-public-key","spec":"https://unfragile.ai/trust","schema":"https://unfragile.ai/schema.json","docs":"https://unfragile.ai/docs"}}