""" Semantic Search for IntelliDocs-ngx. Provides search by meaning rather than just keyword matching. Uses sentence embeddings to understand the semantic content of documents. Examples: - Query: "tax documents from 2023" Finds: Documents about taxes, returns, deductions from 2023 - Query: "medical bills" Finds: Invoices from hospitals, clinics, prescriptions, insurance claims - Query: "employment contract" Finds: Job offers, agreements, NDAs, work contracts """ from __future__ import annotations import logging from pathlib import Path from typing import TYPE_CHECKING import numpy as np import torch from sentence_transformers import SentenceTransformer, util if TYPE_CHECKING: pass logger = logging.getLogger("paperless.ml.semantic_search") class SemanticSearch: """ Semantic search using sentence embeddings. Creates vector representations of documents and queries, then finds similar documents using cosine similarity. This provides much better search results than keyword matching: - Understands synonyms (invoice = bill) - Understands context (medical + bill = healthcare invoice) - Finds related concepts (tax = IRS, deduction, return) """ def __init__( self, model_name: str = "all-MiniLM-L6-v2", cache_dir: str | None = None, ): """ Initialize semantic search. Args: model_name: Sentence transformer model Default: all-MiniLM-L6-v2 (80MB, fast, good quality) Alternatives: - paraphrase-multilingual-MiniLM-L12-v2 (multilingual) - all-mpnet-base-v2 (420MB, highest quality) - all-MiniLM-L12-v2 (120MB, balanced) cache_dir: Directory to cache model """ logger.info(f"Initializing SemanticSearch with model: {model_name}") self.model_name = model_name self.model = SentenceTransformer(model_name, cache_folder=cache_dir) # Storage for embeddings # In production, this should be in a vector database like Faiss or Milvus self.document_embeddings = {} self.document_metadata = {} logger.info("SemanticSearch initialized successfully") def index_document( self, document_id: int, text: str, metadata: dict | None = None, ) -> None: """ Index a document for semantic search. Creates an embedding vector for the document and stores it. Args: document_id: Document ID text: Document text content metadata: Optional metadata (title, date, tags, etc.) """ logger.debug(f"Indexing document {document_id}") # Create embedding embedding = self.model.encode( text, convert_to_tensor=True, show_progress_bar=False, ) # Store embedding and metadata self.document_embeddings[document_id] = embedding self.document_metadata[document_id] = metadata or {} def index_documents_batch( self, documents: list[tuple[int, str, dict | None]], batch_size: int = 32, ) -> None: """ Index multiple documents efficiently. Args: documents: List of (document_id, text, metadata) tuples batch_size: Batch size for encoding """ logger.info(f"Batch indexing {len(documents)} documents") # Process in batches for efficiency for i in range(0, len(documents), batch_size): batch = documents[i : i + batch_size] # Extract texts and IDs doc_ids = [doc[0] for doc in batch] texts = [doc[1] for doc in batch] metadatas = [doc[2] or {} for doc in batch] # Create embeddings for batch embeddings = self.model.encode( texts, convert_to_tensor=True, show_progress_bar=False, batch_size=batch_size, ) # Store embeddings and metadata for doc_id, embedding, metadata in zip(doc_ids, embeddings, metadatas): self.document_embeddings[doc_id] = embedding self.document_metadata[doc_id] = metadata logger.info(f"Indexed {len(documents)} documents successfully") def search( self, query: str, top_k: int = 10, min_score: float = 0.0, ) -> list[tuple[int, float]]: """ Search documents by semantic similarity. Args: query: Search query top_k: Number of results to return min_score: Minimum similarity score (0-1) Returns: list: List of (document_id, similarity_score) tuples Sorted by similarity (highest first) """ if not self.document_embeddings: logger.warning("No documents indexed") return [] logger.info(f"Searching for: '{query}' (top_k={top_k})") # Create query embedding query_embedding = self.model.encode( query, convert_to_tensor=True, show_progress_bar=False, ) # Calculate similarities with all documents similarities = [] for doc_id, doc_embedding in self.document_embeddings.items(): similarity = util.cos_sim(query_embedding, doc_embedding).item() # Only include if above minimum score if similarity >= min_score: similarities.append((doc_id, similarity)) # Sort by similarity (highest first) similarities.sort(key=lambda x: x[1], reverse=True) # Return top k results = similarities[:top_k] logger.info(f"Found {len(results)} results") return results def search_with_metadata( self, query: str, top_k: int = 10, min_score: float = 0.0, ) -> list[dict]: """ Search and return results with metadata. Args: query: Search query top_k: Number of results to return min_score: Minimum similarity score (0-1) Returns: list: List of result dictionaries [ { 'document_id': 123, 'score': 0.85, 'metadata': {...} }, ... ] """ # Get basic results results = self.search(query, top_k, min_score) # Add metadata results_with_metadata = [] for doc_id, score in results: results_with_metadata.append( { "document_id": doc_id, "score": score, "metadata": self.document_metadata.get(doc_id, {}), }, ) return results_with_metadata def find_similar_documents( self, document_id: int, top_k: int = 10, min_score: float = 0.3, ) -> list[tuple[int, float]]: """ Find documents similar to a given document. Useful for "Find similar" functionality. Args: document_id: Document ID to find similar documents for top_k: Number of results to return min_score: Minimum similarity score (0-1) Returns: list: List of (document_id, similarity_score) tuples Excludes the source document """ if document_id not in self.document_embeddings: logger.warning(f"Document {document_id} not indexed") return [] logger.info(f"Finding documents similar to {document_id}") # Get source document embedding source_embedding = self.document_embeddings[document_id] # Calculate similarities with all other documents similarities = [] for doc_id, doc_embedding in self.document_embeddings.items(): # Skip the source document itself if doc_id == document_id: continue similarity = util.cos_sim(source_embedding, doc_embedding).item() # Only include if above minimum score if similarity >= min_score: similarities.append((doc_id, similarity)) # Sort by similarity (highest first) similarities.sort(key=lambda x: x[1], reverse=True) # Return top k results = similarities[:top_k] logger.info(f"Found {len(results)} similar documents") return results def remove_document(self, document_id: int) -> bool: """ Remove a document from the index. Args: document_id: Document ID to remove Returns: bool: True if document was removed, False if not found """ if document_id in self.document_embeddings: del self.document_embeddings[document_id] del self.document_metadata[document_id] logger.debug(f"Removed document {document_id} from index") return True return False def clear_index(self) -> None: """Clear all indexed documents.""" self.document_embeddings.clear() self.document_metadata.clear() logger.info("Cleared all indexed documents") def get_index_size(self) -> int: """ Get number of indexed documents. Returns: int: Number of documents in index """ return len(self.document_embeddings) def save_index(self, filepath: str) -> None: """ Save index to disk. Args: filepath: Path to save index """ logger.info(f"Saving index to {filepath}") index_data = { "model_name": self.model_name, "embeddings": { str(k): v.cpu().numpy() for k, v in self.document_embeddings.items() }, "metadata": self.document_metadata, } torch.save(index_data, filepath) logger.info("Index saved successfully") def load_index(self, filepath: str) -> None: """ Load index from disk. Args: filepath: Path to load index from """ logger.info(f"Loading index from {filepath}") index_data = torch.load(filepath) # Verify model compatibility if index_data.get("model_name") != self.model_name: logger.warning( f"Loaded index was created with model {index_data.get('model_name')}, " f"but current model is {self.model_name}", ) # Load embeddings self.document_embeddings = { int(k): torch.from_numpy(v) for k, v in index_data["embeddings"].items() } # Load metadata self.document_metadata = index_data["metadata"] logger.info(f"Loaded {len(self.document_embeddings)} documents from index") def get_model_info(self) -> dict: """ Get information about the model and index. Returns: dict: Model and index information """ return { "model_name": self.model_name, "indexed_documents": len(self.document_embeddings), "embedding_dimension": ( self.model.get_sentence_embedding_dimension() ), }