paperless-ngx/src/paperless/tests/test_ai_rag.py

from unittest.mock import MagicMock
from unittest.mock import patch

import pytest

from documents.models import Document
from paperless.ai.embedding import build_llm_index_text
from paperless.ai.embedding import get_embedding_dim
from paperless.ai.embedding import get_embedding_model
from paperless.ai.rag import get_context_for_document
from paperless.models import LLMEmbeddingBackend


@pytest.fixture
def mock_document():
    """Provide a ``Document``-spec'd mock populated with sample metadata.

    The mock carries a title, filename, three date strings, two tags, a
    document type, a correspondent, an archive serial number, text content
    and two custom field instances whose ``str()`` equals their value.
    """

    def _named_mock(name):
        # ``name`` is assigned after construction: passing ``name=`` to the
        # MagicMock constructor names the mock itself rather than creating a
        # ``name`` attribute.
        named = MagicMock()
        named.name = name
        return named

    def _custom_field_instance(field_name, value):
        # str(instance) yields the value, mirroring the ``value`` attribute.
        instance = MagicMock(__str__=lambda x: value)
        instance.field = _named_mock(field_name)
        instance.value = value
        return instance

    document = MagicMock(spec=Document)

    # Plain scalar attributes.
    document.title = "Test Title"
    document.filename = "test_file.pdf"
    document.created = "2023-01-01"
    document.added = "2023-01-02"
    document.modified = "2023-01-03"
    document.archive_serial_number = "12345"
    document.content = "This is the document content."

    # Related objects that expose a ``name``.
    document.document_type = _named_mock("Invoice")
    document.correspondent = _named_mock("Test Correspondent")

    # Manager-style relations: ``.all()`` returns the prepared lists.
    document.tags.all = MagicMock(
        return_value=[_named_mock("Tag1"), _named_mock("Tag2")],
    )
    document.custom_fields.all = MagicMock(
        return_value=[
            _custom_field_instance("Field1", "Value1"),
            _custom_field_instance("Field2", "Value2"),
        ],
    )

    return document


@pytest.fixture
def mock_similar_documents():
    """Provide three mock documents with progressively fewer attributes set.

    The first has content, title and filename; the second has no title; the
    third has ``None`` for content, title and filename.
    """
    # One row per document: (content, title, filename).
    attribute_rows = (
        ("Content of document 1", "Title 1", "file1.txt"),
        ("Content of document 2", None, "file2.txt"),
        (None, None, None),
    )

    similar = []
    for content, title, filename in attribute_rows:
        candidate = MagicMock()
        candidate.content = content
        candidate.title = title
        candidate.filename = filename
        similar.append(candidate)
    return similar


@patch("paperless.ai.rag.query_similar_documents")
def test_get_context_for_document(
    mock_query_similar_documents,
    mock_document,
    mock_similar_documents,
):
    """With ``max_docs=2`` the context holds two titled sections.

    The second similar document has no title, so its section is expected to
    be headed by its filename instead.
    """
    mock_query_similar_documents.return_value = mock_similar_documents
    expected_context = (
        "TITLE: Title 1\nContent of document 1\n\n"
        "TITLE: file2.txt\nContent of document 2"
    )

    context = get_context_for_document(mock_document, max_docs=2)

    assert context == expected_context
    # The similarity lookup must have been performed exactly once.
    mock_query_similar_documents.assert_called_once()


def test_get_context_for_document_no_similar_docs(mock_document):
    """An empty similarity result is expected to give an empty context."""
    no_matches = patch("paperless.ai.rag.query_similar_documents", return_value=[])
    with no_matches:
        context = get_context_for_document(mock_document)

    assert context == ""


# Embedding: tests for the helpers imported from paperless.ai.embedding


@pytest.fixture
def mock_ai_config():
    """Patch ``AIConfig`` in ``paperless.ai.embedding`` and yield the mock class.

    Tests configure the instance the code under test receives through
    ``mock_ai_config.return_value``. The patch is undone on teardown.
    """
    config_patcher = patch("paperless.ai.embedding.AIConfig")
    mocked_config_cls = config_patcher.start()
    try:
        yield mocked_config_cls
    finally:
        config_patcher.stop()


def test_get_embedding_model_openai(mock_ai_config):
    """The OpenAI backend builds ``OpenAIEmbedding`` from model and API key."""
    config = mock_ai_config.return_value
    config.llm_embedding_backend = LLMEmbeddingBackend.OPENAI
    config.llm_embedding_model = "text-embedding-3-small"
    config.llm_api_key = "test_api_key"

    with patch("paperless.ai.embedding.OpenAIEmbedding") as embedding_cls:
        model = get_embedding_model()

    # The mock keeps its call record after the patch has been undone.
    embedding_cls.assert_called_once_with(
        model="text-embedding-3-small",
        api_key="test_api_key",
    )
    assert model == embedding_cls.return_value


def test_get_embedding_model_huggingface(mock_ai_config):
    """The HuggingFace backend builds ``HuggingFaceEmbedding`` by model name."""
    config = mock_ai_config.return_value
    config.llm_embedding_backend = LLMEmbeddingBackend.HUGGINGFACE
    config.llm_embedding_model = "sentence-transformers/all-MiniLM-L6-v2"

    with patch("paperless.ai.embedding.HuggingFaceEmbedding") as embedding_cls:
        model = get_embedding_model()

    # The mock keeps its call record after the patch has been undone.
    embedding_cls.assert_called_once_with(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )
    assert model == embedding_cls.return_value


def test_get_embedding_model_invalid_backend(mock_ai_config):
    """An unrecognised backend value is expected to raise ``ValueError``."""
    mock_ai_config.return_value.llm_embedding_backend = "INVALID_BACKEND"

    expected_failure = pytest.raises(
        ValueError,
        match="Unsupported embedding backend: INVALID_BACKEND",
    )
    with expected_failure:
        get_embedding_model()


def test_get_embedding_dim_openai(mock_ai_config):
    """With no model configured, the "openai" backend should report 1536."""
    config = mock_ai_config.return_value
    config.llm_embedding_backend = "openai"
    config.llm_embedding_model = None

    dimension = get_embedding_dim()

    assert dimension == 1536


def test_get_embedding_dim_huggingface(mock_ai_config):
    """With no model configured, the "huggingface" backend should report 384."""
    config = mock_ai_config.return_value
    config.llm_embedding_backend = "huggingface"
    config.llm_embedding_model = None

    dimension = get_embedding_dim()

    assert dimension == 384


def test_get_embedding_dim_unknown_model(mock_ai_config):
    """A model name without a known dimension is expected to raise ``ValueError``."""
    config = mock_ai_config.return_value
    config.llm_embedding_backend = "openai"
    config.llm_embedding_model = "unknown-model"

    expected_failure = pytest.raises(
        ValueError,
        match="Unknown embedding model: unknown-model",
    )
    with expected_failure:
        get_embedding_dim()


def test_build_llm_index_text(mock_document):
    """The index text should contain every metadata fragment of the document.

    ``Note.objects.filter`` is patched to return two notes so that no
    database access is needed for the notes line.
    """
    stored_notes = [MagicMock(note="Note1"), MagicMock(note="Note2")]

    with patch("documents.models.Note.objects.filter") as mock_notes_filter:
        mock_notes_filter.return_value = stored_notes
        result = build_llm_index_text(mock_document)

    expected_fragments = (
        "Title: Test Title",
        "Filename: test_file.pdf",
        "Created: 2023-01-01",
        "Tags: Tag1, Tag2",
        "Document Type: Invoice",
        "Correspondent: Test Correspondent",
        "Notes: Note1,Note2",
        "Content:\n\nThis is the document content.",
        "Custom Field - Field1: Value1\nCustom Field - Field2: Value2",
    )
    for fragment in expected_fragments:
        assert fragment in result