Implement Phase 3 AI/ML enhancement: BERT classification, NER, and semantic search
Co-authored-by: dawnsystem <42047891+dawnsystem@users.noreply.github.com>
parent 36a1939b16
commit e33974f8f7
6 changed files with 2371 additions and 0 deletions
378 src/documents/ml/semantic_search.py Normal file
@@ -0,0 +1,378 @@
"""
Semantic Search for IntelliDocs-ngx.

Provides search by meaning rather than just keyword matching.
Uses sentence embeddings to understand the semantic content of documents.

Examples:
- Query: "tax documents from 2023"
  Finds: Documents about taxes, returns, deductions from 2023

- Query: "medical bills"
  Finds: Invoices from hospitals, clinics, prescriptions, insurance claims

- Query: "employment contract"
  Finds: Job offers, agreements, NDAs, work contracts
"""

from __future__ import annotations

import logging

import torch
from sentence_transformers import SentenceTransformer, util

logger = logging.getLogger("paperless.ml.semantic_search")

class SemanticSearch:
    """
    Semantic search using sentence embeddings.

    Creates vector representations of documents and queries,
    then finds similar documents using cosine similarity.

    This provides much better search results than keyword matching:
    - Understands synonyms (invoice = bill)
    - Understands context (medical + bill = healthcare invoice)
    - Finds related concepts (tax = IRS, deduction, return)
    """

    def __init__(
        self,
        model_name: str = "all-MiniLM-L6-v2",
        cache_dir: str | None = None,
    ):
        """
        Initialize semantic search.

        Args:
            model_name: Sentence transformer model.
                Default: all-MiniLM-L6-v2 (~80MB, fast, good quality)
                Alternatives:
                - paraphrase-multilingual-MiniLM-L12-v2 (multilingual)
                - all-mpnet-base-v2 (~420MB, highest quality)
                - all-MiniLM-L12-v2 (~120MB, balanced)
            cache_dir: Directory in which to cache the downloaded model.
        """
        logger.info(f"Initializing SemanticSearch with model: {model_name}")

        self.model_name = model_name
        self.model = SentenceTransformer(model_name, cache_folder=cache_dir)

        # Storage for embeddings, keyed by document ID.
        # In production, this should live in a vector database like Faiss or Milvus.
        self.document_embeddings: dict[int, torch.Tensor] = {}
        self.document_metadata: dict[int, dict] = {}

        logger.info("SemanticSearch initialized successfully")

    def index_document(
        self,
        document_id: int,
        text: str,
        metadata: dict | None = None,
    ) -> None:
        """
        Index a document for semantic search.

        Creates an embedding vector for the document and stores it.

        Args:
            document_id: Document ID
            text: Document text content
            metadata: Optional metadata (title, date, tags, etc.)
        """
        logger.debug(f"Indexing document {document_id}")

        # Create the embedding.
        embedding = self.model.encode(
            text,
            convert_to_tensor=True,
            show_progress_bar=False,
        )

        # Store embedding and metadata.
        self.document_embeddings[document_id] = embedding
        self.document_metadata[document_id] = metadata or {}
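
    # Illustrative sketch (not part of the committed API): indexing a single
    # document at ingest time. ``extracted_text`` and the metadata keys are
    # hypothetical examples.
    #
    #   search = SemanticSearch()
    #   search.index_document(
    #       document_id=123,
    #       text=extracted_text,
    #       metadata={"title": "Water bill", "created": "2023-05-01"},
    #   )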

    def index_documents_batch(
        self,
        documents: list[tuple[int, str, dict | None]],
        batch_size: int = 32,
    ) -> None:
        """
        Index multiple documents efficiently.

        Args:
            documents: List of (document_id, text, metadata) tuples
            batch_size: Batch size for encoding
        """
        logger.info(f"Batch indexing {len(documents)} documents")

        # Process in batches for efficiency.
        for i in range(0, len(documents), batch_size):
            batch = documents[i : i + batch_size]

            # Extract IDs, texts, and metadata.
            doc_ids = [doc[0] for doc in batch]
            texts = [doc[1] for doc in batch]
            metadatas = [doc[2] or {} for doc in batch]

            # Create embeddings for the whole batch in one call.
            embeddings = self.model.encode(
                texts,
                convert_to_tensor=True,
                show_progress_bar=False,
                batch_size=batch_size,
            )

            # Store embeddings and metadata.
            for doc_id, embedding, metadata in zip(doc_ids, embeddings, metadatas, strict=True):
                self.document_embeddings[doc_id] = embedding
                self.document_metadata[doc_id] = metadata

        logger.info(f"Indexed {len(documents)} documents successfully")
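
    # Illustrative sketch of feeding the batch indexer from the app. The
    # ``Document`` queryset and its ``content``/``title`` fields are
    # assumptions based on the surrounding Paperless codebase, not part of
    # this commit:
    #
    #   docs = [
    #       (doc.pk, doc.content, {"title": doc.title})
    #       for doc in Document.objects.exclude(content="")
    #   ]
    #   search.index_documents_batch(docs, batch_size=32)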

    def search(
        self,
        query: str,
        top_k: int = 10,
        min_score: float = 0.0,
    ) -> list[tuple[int, float]]:
        """
        Search documents by semantic similarity.

        Args:
            query: Search query
            top_k: Number of results to return
            min_score: Minimum similarity score (0-1)

        Returns:
            list: List of (document_id, similarity_score) tuples,
                sorted by similarity (highest first)
        """
        if not self.document_embeddings:
            logger.warning("No documents indexed")
            return []

        logger.info(f"Searching for: '{query}' (top_k={top_k})")

        # Create the query embedding.
        query_embedding = self.model.encode(
            query,
            convert_to_tensor=True,
            show_progress_bar=False,
        )

        # Calculate similarity against every indexed document.
        similarities = []
        for doc_id, doc_embedding in self.document_embeddings.items():
            similarity = util.cos_sim(query_embedding, doc_embedding).item()

            # Only include results above the minimum score.
            if similarity >= min_score:
                similarities.append((doc_id, similarity))

        # Sort by similarity (highest first) and keep the top k.
        similarities.sort(key=lambda x: x[1], reverse=True)
        results = similarities[:top_k]

        logger.info(f"Found {len(results)} results")
        return results
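
    # Illustrative sketch: results are (document_id, score) pairs. The IDs and
    # scores below are made up; actual cosine scores depend on the model and
    # the indexed text.
    #
    #   results = search.search("medical bills", top_k=3, min_score=0.3)
    #   # -> e.g. [(42, 0.71), (17, 0.55), (90, 0.38)]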

    def search_with_metadata(
        self,
        query: str,
        top_k: int = 10,
        min_score: float = 0.0,
    ) -> list[dict]:
        """
        Search and return results with metadata.

        Args:
            query: Search query
            top_k: Number of results to return
            min_score: Minimum similarity score (0-1)

        Returns:
            list: List of result dictionaries:
                [
                    {
                        'document_id': 123,
                        'score': 0.85,
                        'metadata': {...}
                    },
                    ...
                ]
        """
        # Get the basic results.
        results = self.search(query, top_k, min_score)

        # Attach metadata to each result.
        results_with_metadata = []
        for doc_id, score in results:
            results_with_metadata.append(
                {
                    "document_id": doc_id,
                    "score": score,
                    "metadata": self.document_metadata.get(doc_id, {}),
                },
            )

        return results_with_metadata

    def find_similar_documents(
        self,
        document_id: int,
        top_k: int = 10,
        min_score: float = 0.3,
    ) -> list[tuple[int, float]]:
        """
        Find documents similar to a given document.

        Useful for "Find similar" functionality.

        Args:
            document_id: Document ID to find similar documents for
            top_k: Number of results to return
            min_score: Minimum similarity score (0-1)

        Returns:
            list: List of (document_id, similarity_score) tuples,
                excluding the source document
        """
        if document_id not in self.document_embeddings:
            logger.warning(f"Document {document_id} not indexed")
            return []

        logger.info(f"Finding documents similar to {document_id}")

        # Get the source document's embedding.
        source_embedding = self.document_embeddings[document_id]

        # Calculate similarity against all other documents.
        similarities = []
        for doc_id, doc_embedding in self.document_embeddings.items():
            # Skip the source document itself.
            if doc_id == document_id:
                continue

            similarity = util.cos_sim(source_embedding, doc_embedding).item()

            # Only include results above the minimum score.
            if similarity >= min_score:
                similarities.append((doc_id, similarity))

        # Sort by similarity (highest first) and keep the top k.
        similarities.sort(key=lambda x: x[1], reverse=True)
        results = similarities[:top_k]

        logger.info(f"Found {len(results)} similar documents")
        return results
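
    # Illustrative sketch: backing a "Find similar" button. The 0.3 threshold
    # is this method's default, not a tuned value.
    #
    #   for doc_id, score in search.find_similar_documents(42, top_k=5):
    #       print(f"document {doc_id}: {score:.2f}")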

    def remove_document(self, document_id: int) -> bool:
        """
        Remove a document from the index.

        Args:
            document_id: Document ID to remove

        Returns:
            bool: True if document was removed, False if not found
        """
        if document_id in self.document_embeddings:
            del self.document_embeddings[document_id]
            del self.document_metadata[document_id]
            logger.debug(f"Removed document {document_id} from index")
            return True

        return False

    def clear_index(self) -> None:
        """Clear all indexed documents."""
        self.document_embeddings.clear()
        self.document_metadata.clear()
        logger.info("Cleared all indexed documents")

    def get_index_size(self) -> int:
        """
        Get the number of indexed documents.

        Returns:
            int: Number of documents in the index
        """
        return len(self.document_embeddings)

    def save_index(self, filepath: str) -> None:
        """
        Save the index to disk.

        Args:
            filepath: Path to save the index to
        """
        logger.info(f"Saving index to {filepath}")

        index_data = {
            "model_name": self.model_name,
            "embeddings": {
                str(k): v.cpu().numpy() for k, v in self.document_embeddings.items()
            },
            "metadata": self.document_metadata,
        }

        torch.save(index_data, filepath)
        logger.info("Index saved successfully")

    def load_index(self, filepath: str) -> None:
        """
        Load an index from disk.

        Args:
            filepath: Path to load the index from
        """
        logger.info(f"Loading index from {filepath}")

        index_data = torch.load(filepath)

        # Verify model compatibility.
        if index_data.get("model_name") != self.model_name:
            logger.warning(
                f"Loaded index was created with model {index_data.get('model_name')}, "
                f"but the current model is {self.model_name}",
            )

        # Load embeddings.
        self.document_embeddings = {
            int(k): torch.from_numpy(v) for k, v in index_data["embeddings"].items()
        }

        # Load metadata.
        self.document_metadata = index_data["metadata"]

        logger.info(f"Loaded {len(self.document_embeddings)} documents from index")
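
    # Illustrative sketch: persistence round-trip. Embeddings are saved as
    # NumPy arrays and rebuilt as CPU tensors, so the same model_name should
    # be used when loading (a mismatch only logs a warning).
    #
    #   search.save_index("/tmp/semantic_index.pt")
    #   fresh = SemanticSearch()
    #   fresh.load_index("/tmp/semantic_index.pt")
    #   assert fresh.get_index_size() == search.get_index_size()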

    def get_model_info(self) -> dict:
        """
        Get information about the model and the index.

        Returns:
            dict: Model and index information
        """
        return {
            "model_name": self.model_name,
            "indexed_documents": len(self.document_embeddings),
            "embedding_dimension": self.model.get_sentence_embedding_dimension(),
        }
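

if __name__ == "__main__":
    # Minimal end-to-end demo; a sketch rather than part of the commit. It
    # assumes the all-MiniLM-L6-v2 weights can be downloaded or are cached.
    search = SemanticSearch()
    search.index_documents_batch(
        [
            (1, "Invoice from Mercy Hospital for an MRI scan", {"title": "MRI invoice"}),
            (2, "Employment agreement between ACME Corp and Jane Doe", {"title": "Contract"}),
            (3, "2023 federal income tax return, Form 1040", {"title": "Tax return"}),
        ],
    )
    for hit in search.search_with_metadata("medical bills", top_k=2):
        print(hit["document_id"], round(hit["score"], 2), hit["metadata"])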