# paperless-ngx/src/documents/ml/semantic_search.py
"""
Semantic Search for IntelliDocs-ngx.
Provides search by meaning rather than just keyword matching.
Uses sentence embeddings to understand the semantic content of documents.
Examples:
- Query: "tax documents from 2023"
Finds: Documents about taxes, returns, deductions from 2023
- Query: "medical bills"
Finds: Invoices from hospitals, clinics, prescriptions, insurance claims
- Query: "employment contract"
Finds: Job offers, agreements, NDAs, work contracts
"""

from __future__ import annotations

import logging

import torch
from sentence_transformers import SentenceTransformer, util

logger = logging.getLogger("paperless.ml.semantic_search")


class SemanticSearch:
    """
    Semantic search using sentence embeddings.

    Creates vector representations of documents and queries, then finds
    similar documents using cosine similarity. This provides much better
    search results than keyword matching:

    - Understands synonyms (invoice = bill)
    - Understands context (medical + bill = healthcare invoice)
    - Finds related concepts (tax = IRS, deduction, return)
    """

    def __init__(
        self,
        model_name: str = "all-MiniLM-L6-v2",
        cache_dir: str | None = None,
    ):
        """
        Initialize semantic search.

        Args:
            model_name: Sentence-transformers model name.
                Default: all-MiniLM-L6-v2 (~80 MB, fast, good quality).
                Alternatives:
                - paraphrase-multilingual-MiniLM-L12-v2 (multilingual)
                - all-mpnet-base-v2 (~420 MB, highest quality)
                - all-MiniLM-L12-v2 (~120 MB, balanced)
            cache_dir: Directory in which to cache the downloaded model.
        """
        logger.info(f"Initializing SemanticSearch with model: {model_name}")
        self.model_name = model_name
        self.model = SentenceTransformer(model_name, cache_folder=cache_dir)
        # In-memory storage for embeddings. In production, this should live
        # in a vector database such as Faiss or Milvus.
        self.document_embeddings: dict[int, torch.Tensor] = {}
        self.document_metadata: dict[int, dict] = {}
        logger.info("SemanticSearch initialized successfully")

    def index_document(
        self,
        document_id: int,
        text: str,
        metadata: dict | None = None,
    ) -> None:
        """
        Index a document for semantic search.

        Creates an embedding vector for the document and stores it.

        Args:
            document_id: Document ID
            text: Document text content
            metadata: Optional metadata (title, date, tags, etc.)
        """
        logger.debug(f"Indexing document {document_id}")
        # Create the embedding
        embedding = self.model.encode(
            text,
            convert_to_tensor=True,
            show_progress_bar=False,
        )
        # Store embedding and metadata
        self.document_embeddings[document_id] = embedding
        self.document_metadata[document_id] = metadata or {}
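
    # Usage sketch (the text and metadata keys shown here are arbitrary
    # examples, not a fixed schema):
    #
    #     search.index_document(
    #         42,
    #         "Form 1040 tax return for fiscal year 2023...",
    #         metadata={"title": "Tax return 2023", "tags": ["taxes"]},
    #     )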

    def index_documents_batch(
        self,
        documents: list[tuple[int, str, dict | None]],
        batch_size: int = 32,
    ) -> None:
        """
        Index multiple documents efficiently.

        Args:
            documents: List of (document_id, text, metadata) tuples
            batch_size: Batch size for encoding
        """
        logger.info(f"Batch indexing {len(documents)} documents")
        # Process in batches for efficiency
        for i in range(0, len(documents), batch_size):
            batch = documents[i : i + batch_size]
            # Extract IDs, texts, and metadata
            doc_ids = [doc[0] for doc in batch]
            texts = [doc[1] for doc in batch]
            metadatas = [doc[2] or {} for doc in batch]
            # Encode the whole batch in one forward pass
            embeddings = self.model.encode(
                texts,
                convert_to_tensor=True,
                show_progress_bar=False,
                batch_size=batch_size,
            )
            # Store embeddings and metadata
            for doc_id, embedding, metadata in zip(doc_ids, embeddings, metadatas):
                self.document_embeddings[doc_id] = embedding
                self.document_metadata[doc_id] = metadata
        logger.info(f"Indexed {len(documents)} documents successfully")

    def search(
        self,
        query: str,
        top_k: int = 10,
        min_score: float = 0.0,
    ) -> list[tuple[int, float]]:
        """
        Search documents by semantic similarity.

        Args:
            query: Search query
            top_k: Number of results to return
            min_score: Minimum similarity score (0-1)

        Returns:
            list: List of (document_id, similarity_score) tuples,
                sorted by similarity (highest first)
        """
        if not self.document_embeddings:
            logger.warning("No documents indexed")
            return []
        logger.info(f"Searching for: '{query}' (top_k={top_k})")
        # Create the query embedding
        query_embedding = self.model.encode(
            query,
            convert_to_tensor=True,
            show_progress_bar=False,
        )
        # Calculate cosine similarity against every indexed document
        similarities = []
        for doc_id, doc_embedding in self.document_embeddings.items():
            similarity = util.cos_sim(query_embedding, doc_embedding).item()
            # Only include results above the minimum score
            if similarity >= min_score:
                similarities.append((doc_id, similarity))
        # Sort by similarity (highest first)
        similarities.sort(key=lambda x: x[1], reverse=True)
        # Return the top k results
        results = similarities[:top_k]
        logger.info(f"Found {len(results)} results")
        return results
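
    # Example query (assumes documents are indexed; IDs and scores are
    # illustrative):
    #
    #     results = search.search("tax documents from 2023", top_k=3, min_score=0.3)
    #     # -> [(17, 0.71), (4, 0.55), (23, 0.41)]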

    def search_with_metadata(
        self,
        query: str,
        top_k: int = 10,
        min_score: float = 0.0,
    ) -> list[dict]:
        """
        Search and return results with metadata.

        Args:
            query: Search query
            top_k: Number of results to return
            min_score: Minimum similarity score (0-1)

        Returns:
            list: List of result dictionaries:
                [
                    {
                        'document_id': 123,
                        'score': 0.85,
                        'metadata': {...},
                    },
                    ...
                ]
        """
        # Get the basic results
        results = self.search(query, top_k, min_score)
        # Attach metadata to each result
        results_with_metadata = []
        for doc_id, score in results:
            results_with_metadata.append(
                {
                    "document_id": doc_id,
                    "score": score,
                    "metadata": self.document_metadata.get(doc_id, {}),
                },
            )
        return results_with_metadata
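
    # Example result shape (illustrative values):
    #
    #     search.search_with_metadata("medical bills", top_k=1)
    #     # -> [{"document_id": 123, "score": 0.85, "metadata": {"title": "..."}}]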

    def find_similar_documents(
        self,
        document_id: int,
        top_k: int = 10,
        min_score: float = 0.3,
    ) -> list[tuple[int, float]]:
        """
        Find documents similar to a given document.

        Useful for "Find similar" functionality.

        Args:
            document_id: Document ID to find similar documents for
            top_k: Number of results to return
            min_score: Minimum similarity score (0-1)

        Returns:
            list: List of (document_id, similarity_score) tuples,
                excluding the source document
        """
        if document_id not in self.document_embeddings:
            logger.warning(f"Document {document_id} not indexed")
            return []
        logger.info(f"Finding documents similar to {document_id}")
        # Get the source document embedding
        source_embedding = self.document_embeddings[document_id]
        # Calculate similarities with all other documents
        similarities = []
        for doc_id, doc_embedding in self.document_embeddings.items():
            # Skip the source document itself
            if doc_id == document_id:
                continue
            similarity = util.cos_sim(source_embedding, doc_embedding).item()
            # Only include results above the minimum score
            if similarity >= min_score:
                similarities.append((doc_id, similarity))
        # Sort by similarity (highest first)
        similarities.sort(key=lambda x: x[1], reverse=True)
        # Return the top k results
        results = similarities[:top_k]
        logger.info(f"Found {len(results)} similar documents")
        return results
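
    # "Find similar" sketch (illustrative IDs and scores; document 123
    # itself is excluded from the results):
    #
    #     similar = search.find_similar_documents(123, top_k=5)
    #     # -> [(456, 0.78), (789, 0.65)]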

    def remove_document(self, document_id: int) -> bool:
        """
        Remove a document from the index.

        Args:
            document_id: Document ID to remove

        Returns:
            bool: True if the document was removed, False if not found
        """
        if document_id in self.document_embeddings:
            del self.document_embeddings[document_id]
            del self.document_metadata[document_id]
            logger.debug(f"Removed document {document_id} from index")
            return True
        return False

    def clear_index(self) -> None:
        """Clear all indexed documents."""
        self.document_embeddings.clear()
        self.document_metadata.clear()
        logger.info("Cleared all indexed documents")

    def get_index_size(self) -> int:
        """
        Get the number of indexed documents.

        Returns:
            int: Number of documents in the index
        """
        return len(self.document_embeddings)

    def save_index(self, filepath: str) -> None:
        """
        Save the index to disk.

        Args:
            filepath: Path to save the index to
        """
        logger.info(f"Saving index to {filepath}")
        index_data = {
            "model_name": self.model_name,
            # Move tensors to CPU and convert to numpy for portable storage
            "embeddings": {
                str(k): v.cpu().numpy() for k, v in self.document_embeddings.items()
            },
            "metadata": self.document_metadata,
        }
        torch.save(index_data, filepath)
        logger.info("Index saved successfully")

    def load_index(self, filepath: str) -> None:
        """
        Load an index from disk.

        Args:
            filepath: Path to load the index from
        """
        logger.info(f"Loading index from {filepath}")
        # The index contains pickled numpy arrays, so a full unpickle is
        # required; recent PyTorch releases default to weights_only=True.
        # Only load index files from trusted locations.
        index_data = torch.load(filepath, weights_only=False)
        # Verify model compatibility
        if index_data.get("model_name") != self.model_name:
            logger.warning(
                f"Loaded index was created with model {index_data.get('model_name')}, "
                f"but current model is {self.model_name}",
            )
        # Load embeddings
        self.document_embeddings = {
            int(k): torch.from_numpy(v) for k, v in index_data["embeddings"].items()
        }
        # Load metadata
        self.document_metadata = index_data["metadata"]
        logger.info(f"Loaded {len(self.document_embeddings)} documents from index")

    def get_model_info(self) -> dict:
        """
        Get information about the model and index.

        Returns:
            dict: Model and index information
        """
        return {
            "model_name": self.model_name,
            "indexed_documents": len(self.document_embeddings),
            "embedding_dimension": self.model.get_sentence_embedding_dimension(),
        }