# paperless-ngx/src/documents/ml/semantic_search.py
"""
Semantic Search for IntelliDocs-ngx.
Provides search by meaning rather than just keyword matching.
Uses sentence embeddings to understand the semantic content of documents.
Examples:
- Query: "tax documents from 2023"
Finds: Documents about taxes, returns, deductions from 2023
- Query: "medical bills"
Finds: Invoices from hospitals, clinics, prescriptions, insurance claims
- Query: "employment contract"
Finds: Job offers, agreements, NDAs, work contracts
"""

from __future__ import annotations

import logging

import torch
from sentence_transformers import SentenceTransformer, util

logger = logging.getLogger("paperless.ml.semantic_search")


class SemanticSearch:
    """
    Semantic search using sentence embeddings.

    Creates vector representations of documents and queries, then finds
    similar documents using cosine similarity. This provides much better
    search results than keyword matching:

    - Understands synonyms (invoice = bill)
    - Understands context (medical + bill = healthcare invoice)
    - Finds related concepts (tax = IRS, deduction, return)
    """

    def __init__(
        self,
        model_name: str = "all-MiniLM-L6-v2",
        cache_dir: str | None = None,
    ):
        """
        Initialize semantic search.

        Args:
            model_name: Sentence-transformers model name.
                Default: all-MiniLM-L6-v2 (~80 MB, fast, good quality).
                Alternatives:
                - paraphrase-multilingual-MiniLM-L12-v2 (multilingual)
                - all-mpnet-base-v2 (~420 MB, highest quality)
                - all-MiniLM-L12-v2 (~120 MB, balanced)
            cache_dir: Directory in which to cache the downloaded model.
        """
        logger.info(f"Initializing SemanticSearch with model: {model_name}")
        self.model_name = model_name
        self.model = SentenceTransformer(model_name, cache_folder=cache_dir)
        # In-memory storage for embeddings. In production, this should live
        # in a vector database such as Faiss or Milvus.
        self.document_embeddings: dict[int, torch.Tensor] = {}
        self.document_metadata: dict[int, dict] = {}
        logger.info("SemanticSearch initialized successfully")

    def index_document(
        self,
        document_id: int,
        text: str,
        metadata: dict | None = None,
    ) -> None:
        """
        Index a document for semantic search.

        Creates an embedding vector for the document and stores it.

        Args:
            document_id: Document ID
            text: Document text content
            metadata: Optional metadata (title, date, tags, etc.)
        """
        logger.debug(f"Indexing document {document_id}")
        # Create the embedding
        embedding = self.model.encode(
            text,
            convert_to_tensor=True,
            show_progress_bar=False,
        )
        # Store embedding and metadata
        self.document_embeddings[document_id] = embedding
        self.document_metadata[document_id] = metadata or {}
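
    # Usage sketch (the text and metadata keys shown here are arbitrary
    # examples, not a fixed schema):
    #
    #     search.index_document(
    #         42,
    #         "Form 1040 tax return for fiscal year 2023...",
    #         metadata={"title": "Tax return 2023", "tags": ["taxes"]},
    #     )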

    def index_documents_batch(
        self,
        documents: list[tuple[int, str, dict | None]],
        batch_size: int = 32,
    ) -> None:
        """
        Index multiple documents efficiently.

        Args:
            documents: List of (document_id, text, metadata) tuples
            batch_size: Batch size for encoding
        """
        logger.info(f"Batch indexing {len(documents)} documents")
        # Process in batches for efficiency
        for i in range(0, len(documents), batch_size):
            batch = documents[i : i + batch_size]
            # Extract IDs, texts, and metadata
            doc_ids = [doc[0] for doc in batch]
            texts = [doc[1] for doc in batch]
            metadatas = [doc[2] or {} for doc in batch]
            # Encode the whole batch in one forward pass
            embeddings = self.model.encode(
                texts,
                convert_to_tensor=True,
                show_progress_bar=False,
                batch_size=batch_size,
            )
            # Store embeddings and metadata
            for doc_id, embedding, metadata in zip(doc_ids, embeddings, metadatas):
                self.document_embeddings[doc_id] = embedding
                self.document_metadata[doc_id] = metadata
        logger.info(f"Indexed {len(documents)} documents successfully")

    def search(
        self,
        query: str,
        top_k: int = 10,
        min_score: float = 0.0,
    ) -> list[tuple[int, float]]:
        """
        Search documents by semantic similarity.

        Args:
            query: Search query
            top_k: Number of results to return
            min_score: Minimum similarity score (0-1)

        Returns:
            list: List of (document_id, similarity_score) tuples,
                sorted by similarity (highest first)
        """
        if not self.document_embeddings:
            logger.warning("No documents indexed")
            return []
        logger.info(f"Searching for: '{query}' (top_k={top_k})")
        # Create the query embedding
        query_embedding = self.model.encode(
            query,
            convert_to_tensor=True,
            show_progress_bar=False,
        )
        # Calculate cosine similarity against every indexed document
        similarities = []
        for doc_id, doc_embedding in self.document_embeddings.items():
            similarity = util.cos_sim(query_embedding, doc_embedding).item()
            # Only include results above the minimum score
            if similarity >= min_score:
                similarities.append((doc_id, similarity))
        # Sort by similarity (highest first)
        similarities.sort(key=lambda x: x[1], reverse=True)
        # Return the top k results
        results = similarities[:top_k]
        logger.info(f"Found {len(results)} results")
        return results
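
    # Example query (assumes documents are indexed; IDs and scores are
    # illustrative):
    #
    #     results = search.search("tax documents from 2023", top_k=3, min_score=0.3)
    #     # -> [(17, 0.71), (4, 0.55), (23, 0.41)]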

    def search_with_metadata(
        self,
        query: str,
        top_k: int = 10,
        min_score: float = 0.0,
    ) -> list[dict]:
        """
        Search and return results with metadata.

        Args:
            query: Search query
            top_k: Number of results to return
            min_score: Minimum similarity score (0-1)

        Returns:
            list: List of result dictionaries:
                [
                    {
                        'document_id': 123,
                        'score': 0.85,
                        'metadata': {...},
                    },
                    ...
                ]
        """
        # Get the basic results
        results = self.search(query, top_k, min_score)
        # Attach metadata to each result
        results_with_metadata = []
        for doc_id, score in results:
            results_with_metadata.append(
                {
                    "document_id": doc_id,
                    "score": score,
                    "metadata": self.document_metadata.get(doc_id, {}),
                },
            )
        return results_with_metadata
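
    # Example result shape (illustrative values):
    #
    #     search.search_with_metadata("medical bills", top_k=1)
    #     # -> [{"document_id": 123, "score": 0.85, "metadata": {"title": "..."}}]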

    def find_similar_documents(
        self,
        document_id: int,
        top_k: int = 10,
        min_score: float = 0.3,
    ) -> list[tuple[int, float]]:
        """
        Find documents similar to a given document.

        Useful for "Find similar" functionality.

        Args:
            document_id: Document ID to find similar documents for
            top_k: Number of results to return
            min_score: Minimum similarity score (0-1)

        Returns:
            list: List of (document_id, similarity_score) tuples,
                excluding the source document
        """
        if document_id not in self.document_embeddings:
            logger.warning(f"Document {document_id} not indexed")
            return []
        logger.info(f"Finding documents similar to {document_id}")
        # Get the source document embedding
        source_embedding = self.document_embeddings[document_id]
        # Calculate similarities with all other documents
        similarities = []
        for doc_id, doc_embedding in self.document_embeddings.items():
            # Skip the source document itself
            if doc_id == document_id:
                continue
            similarity = util.cos_sim(source_embedding, doc_embedding).item()
            # Only include results above the minimum score
            if similarity >= min_score:
                similarities.append((doc_id, similarity))
        # Sort by similarity (highest first)
        similarities.sort(key=lambda x: x[1], reverse=True)
        # Return the top k results
        results = similarities[:top_k]
        logger.info(f"Found {len(results)} similar documents")
        return results
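
    # "Find similar" sketch (illustrative IDs and scores; document 123
    # itself is excluded from the results):
    #
    #     similar = search.find_similar_documents(123, top_k=5)
    #     # -> [(456, 0.78), (789, 0.65)]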

    def remove_document(self, document_id: int) -> bool:
        """
        Remove a document from the index.

        Args:
            document_id: Document ID to remove

        Returns:
            bool: True if the document was removed, False if not found
        """
        if document_id in self.document_embeddings:
            del self.document_embeddings[document_id]
            del self.document_metadata[document_id]
            logger.debug(f"Removed document {document_id} from index")
            return True
        return False

    def clear_index(self) -> None:
        """Clear all indexed documents."""
        self.document_embeddings.clear()
        self.document_metadata.clear()
        logger.info("Cleared all indexed documents")

    def get_index_size(self) -> int:
        """
        Get the number of indexed documents.

        Returns:
            int: Number of documents in the index
        """
        return len(self.document_embeddings)

    def save_index(self, filepath: str) -> None:
        """
        Save the index to disk.

        Args:
            filepath: Path to save the index to
        """
        logger.info(f"Saving index to {filepath}")
        index_data = {
            "model_name": self.model_name,
            # Move tensors to CPU and convert to numpy for portable storage
            "embeddings": {
                str(k): v.cpu().numpy() for k, v in self.document_embeddings.items()
            },
            "metadata": self.document_metadata,
        }
        torch.save(index_data, filepath)
        logger.info("Index saved successfully")

    def load_index(self, filepath: str) -> None:
        """
        Load an index from disk.

        Args:
            filepath: Path to load the index from
        """
        logger.info(f"Loading index from {filepath}")
        # The index contains pickled numpy arrays, so a full unpickle is
        # required; recent PyTorch releases default to weights_only=True.
        # Only load index files from trusted locations.
        index_data = torch.load(filepath, weights_only=False)
        # Verify model compatibility
        if index_data.get("model_name") != self.model_name:
            logger.warning(
                f"Loaded index was created with model {index_data.get('model_name')}, "
                f"but current model is {self.model_name}",
            )
        # Load embeddings
        self.document_embeddings = {
            int(k): torch.from_numpy(v) for k, v in index_data["embeddings"].items()
        }
        # Load metadata
        self.document_metadata = index_data["metadata"]
        logger.info(f"Loaded {len(self.document_embeddings)} documents from index")

    def get_model_info(self) -> dict:
        """
        Get information about the model and index.

        Returns:
            dict: Model and index information
        """
        return {
            "model_name": self.model_name,
            "indexed_documents": len(self.document_embeddings),
            "embedding_dimension": self.model.get_sentence_embedding_dimension(),
        }