mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-12-10 08:37:19 +01:00
Add comprehensive unit and integration tests for AI Scanner
Co-authored-by: dawnsystem <42047891+dawnsystem@users.noreply.github.com>
This commit is contained in:
parent
31d8fdc0d1
commit
f63c7faa92
2 changed files with 1957 additions and 0 deletions
1417
src/documents/tests/test_ai_scanner.py
Normal file
1417
src/documents/tests/test_ai_scanner.py
Normal file
File diff suppressed because it is too large
Load diff
540
src/documents/tests/test_ai_scanner_integration.py
Normal file
540
src/documents/tests/test_ai_scanner_integration.py
Normal file
|
|
@ -0,0 +1,540 @@
|
|||
"""
Integration tests for AI Scanner Module

These tests verify the AI scanner works correctly with real database
operations and model interactions, testing the full workflow from
document consumption to metadata application.
"""
|
||||
|
||||
from unittest import mock
|
||||
|
||||
from django.test import TestCase, TransactionTestCase
|
||||
|
||||
from documents.ai_scanner import (
|
||||
AIDocumentScanner,
|
||||
AIScanResult,
|
||||
get_ai_scanner,
|
||||
)
|
||||
from documents.models import (
|
||||
Correspondent,
|
||||
CustomField,
|
||||
Document,
|
||||
DocumentType,
|
||||
StoragePath,
|
||||
Tag,
|
||||
Workflow,
|
||||
WorkflowTrigger,
|
||||
WorkflowAction,
|
||||
)
|
||||
|
||||
|
||||
class TestAIScannerIntegrationBasic(TestCase):
    """Run the scanner against real model rows, from scan through to apply."""

    def setUp(self):
        """Create a sample invoice and one auto-matching object per metadata kind."""
        self.document = Document.objects.create(
            title="Invoice from ACME Corporation",
            content="Invoice #12345 from ACME Corporation dated 2024-01-01. Total: $1,000",
        )

        self.tag_invoice = Tag.objects.create(
            name="Invoice",
            matching_algorithm=Tag.MATCH_AUTO,
            match="invoice",
        )
        self.tag_important = Tag.objects.create(
            name="Important",
            matching_algorithm=Tag.MATCH_AUTO,
            match="total",
        )

        self.correspondent = Correspondent.objects.create(
            name="ACME Corporation",
            matching_algorithm=Correspondent.MATCH_AUTO,
            match="acme",
        )

        self.doc_type = DocumentType.objects.create(
            name="Invoice",
            matching_algorithm=DocumentType.MATCH_AUTO,
            match="invoice",
        )

        self.storage_path = StoragePath.objects.create(
            name="Invoices",
            path="/invoices",
            matching_algorithm=StoragePath.MATCH_AUTO,
            match="invoice",
        )

    # Decorators are applied bottom-up, so the lowest patch supplies the first
    # mock argument after ``self``.
    @mock.patch("documents.ai_scanner.match_tags")
    @mock.patch("documents.ai_scanner.match_correspondents")
    @mock.patch("documents.ai_scanner.match_document_types")
    @mock.patch("documents.ai_scanner.match_storage_paths")
    def test_full_scan_and_apply_workflow(
        self,
        mock_storage,
        mock_types,
        mock_correspondents,
        mock_tags,
    ):
        """Scan a document, apply the result, and check the stored document."""
        # Every matcher hands back the fixtures created in setUp.
        mock_tags.return_value = [self.tag_invoice, self.tag_important]
        mock_correspondents.return_value = [self.correspondent]
        mock_types.return_value = [self.doc_type]
        mock_storage.return_value = [self.storage_path]

        scanner = AIDocumentScanner(auto_apply_threshold=0.80)

        # Step 1: scan.
        scan_result = scanner.scan_document(self.document, self.document.content)

        self.assertIsNotNone(scan_result)
        self.assertGreater(len(scan_result.tags), 0)
        for attribute in ("correspondent", "document_type", "storage_path"):
            self.assertIsNotNone(getattr(scan_result, attribute))

        # Step 2: apply.
        outcome = scanner.apply_scan_results(
            self.document,
            scan_result,
            auto_apply=True,
        )

        applied = outcome["applied"]
        self.assertGreater(len(applied["tags"]), 0)
        self.assertIsNotNone(applied["correspondent"])

        # Step 3: the changes must be visible on a fresh read from the database.
        self.document.refresh_from_db()
        expected_relations = (
            ("correspondent", self.correspondent),
            ("document_type", self.doc_type),
            ("storage_path", self.storage_path),
        )
        for field_name, expected in expected_relations:
            self.assertEqual(getattr(self.document, field_name), expected)

    @mock.patch("documents.ai_scanner.match_tags")
    def test_scan_with_no_matches(self, mock_tags):
        """Scanning unrelated text yields no tags, correspondent or document type."""
        mock_tags.return_value = []

        scanner = AIDocumentScanner()
        unrelated_text = "Some random text with no matches"

        scan_result = scanner.scan_document(self.document, unrelated_text)

        # Nothing should have been suggested.
        self.assertEqual(len(scan_result.tags), 0)
        self.assertIsNone(scan_result.correspondent)
        self.assertIsNone(scan_result.document_type)
|
||||
|
||||
|
||||
class TestAIScannerIntegrationCustomFields(TestCase):
    """Check that scan results carry values for existing custom fields."""

    def setUp(self):
        """Create an invoice document and four custom fields to extract into."""
        self.document = Document.objects.create(
            title="Invoice",
            content="Invoice #INV-123 dated 2024-01-01. Amount: $1,500. Contact: john@example.com",
        )

        string_type = CustomField.FieldDataType.STRING

        self.field_date = CustomField.objects.create(
            name="Invoice Date",
            data_type=CustomField.FieldDataType.DATE,
        )
        self.field_number = CustomField.objects.create(
            name="Invoice Number",
            data_type=string_type,
        )
        self.field_amount = CustomField.objects.create(
            name="Total Amount",
            data_type=string_type,
        )
        self.field_email = CustomField.objects.create(
            name="Contact Email",
            data_type=string_type,
        )

    def test_custom_field_extraction_integration(self):
        """Extract custom fields from a scan whose NER extractor is a mock."""
        scanner = AIDocumentScanner()

        # Stand-in NER extractor returning a fixed set of entities.
        canned_entities = {
            "dates": [{"text": "2024-01-01"}],
            "amounts": [{"text": "$1,500"}],
            "invoice_numbers": ["INV-123"],
            "emails": ["john@example.com"],
        }
        fake_ner = mock.MagicMock()
        fake_ner.extract_all.return_value = canned_entities
        scanner._ner_extractor = fake_ner

        scan_result = scanner.scan_document(self.document, self.document.content)

        # At least one custom field must have been extracted ...
        self.assertGreater(len(scan_result.custom_fields), 0)

        # ... and the date and amount fields must be among them.
        extracted_field_ids = list(scan_result.custom_fields.keys())
        for field in (self.field_date, self.field_amount):
            self.assertIn(field.id, extracted_field_ids)
|
||||
|
||||
|
||||
class TestAIScannerIntegrationWorkflows(TestCase):
    """Check workflow suggestions against workflows stored in the database."""

    def setUp(self):
        """Create a document and two enabled workflows with consumption triggers."""
        self.document = Document.objects.create(
            title="Invoice",
            content="Invoice document",
        )

        consumption = WorkflowTrigger.WorkflowTriggerType.CONSUMPTION

        # NOTE(review): this assumes WorkflowTrigger and WorkflowAction accept a
        # ``workflow`` keyword - confirm against documents.models.
        self.workflow1 = Workflow.objects.create(
            name="Invoice Processing",
            enabled=True,
        )
        self.trigger1 = WorkflowTrigger.objects.create(
            workflow=self.workflow1,
            type=consumption,
        )
        self.action1 = WorkflowAction.objects.create(
            workflow=self.workflow1,
            type=WorkflowAction.WorkflowActionType.ASSIGNMENT,
        )

        self.workflow2 = Workflow.objects.create(
            name="Archive Documents",
            enabled=True,
        )
        self.trigger2 = WorkflowTrigger.objects.create(
            workflow=self.workflow2,
            type=consumption,
        )

    def test_workflow_suggestion_integration(self):
        """The first workflow must appear among the scanner's suggestions."""
        scanner = AIDocumentScanner(suggest_threshold=0.5)

        # Hand-built scan result carrying a document type and one tag.
        scan_result = AIScanResult()
        scan_result.document_type = (1, 0.85)
        scan_result.tags = [(1, 0.80)]

        suggestions = scanner._suggest_workflows(
            self.document,
            self.document.content,
            scan_result,
        )

        self.assertGreater(len(suggestions), 0)

        # Each suggestion is unpacked as a (workflow id, second value) pair.
        suggested_ids = []
        for workflow_id, _ in suggestions:
            suggested_ids.append(workflow_id)
        self.assertIn(self.workflow1.id, suggested_ids)
|
||||
|
||||
|
||||
class TestAIScannerIntegrationTransactions(TransactionTestCase):
    """Test AI scanner with transactions and rollbacks."""

    def setUp(self):
        """Create a document plus a tag and correspondent that could be applied."""
        self.document = Document.objects.create(
            title="Test Document",
            content="Test content",
        )
        self.tag = Tag.objects.create(name="TestTag")
        self.correspondent = Correspondent.objects.create(name="TestCorp")

    def test_transaction_rollback_on_error(self):
        """A failing ``Document.save`` must leave the stored document unmodified."""
        scanner = AIDocumentScanner()

        # Both suggestions carry a confidence of 0.90.
        scan_result = AIScanResult()
        scan_result.tags = [(self.tag.id, 0.90)]
        scan_result.correspondent = (self.correspondent.id, 0.90)

        def failing_save(self, *args, **kwargs):
            # Every save attempt fails, which forces the error path.
            raise Exception("Forced save failure")

        with mock.patch.object(Document, "save", failing_save):
            try:
                scanner.apply_scan_results(
                    self.document,
                    scan_result,
                    auto_apply=True,
                )
            except Exception:
                # Whether the scanner re-raises or handles the failure itself
                # is not under test here; only the database state matters.
                pass

        # Reload from the database so in-memory changes made before the failed
        # save do not mask what was actually stored.
        self.document.refresh_from_db()

        # Nothing from the scan result may have been stored.
        self.assertIsNone(self.document.correspondent)
        # NOTE(review): assumes Document exposes its tag relation as ``tags`` -
        # confirm against documents.models.
        self.assertEqual(self.document.tags.count(), 0)
        self.assertEqual(self.document.title, "Test Document")
        self.assertEqual(self.document.content, "Test content")
|
||||
|
||||
|
||||
class TestAIScannerIntegrationPerformance(TestCase):
    """Scan several documents in a row with one scanner instance."""

    def test_scan_multiple_documents(self):
        """Five consecutive scans each produce an ``AIScanResult``."""
        scanner = AIDocumentScanner()

        documents = [
            Document.objects.create(
                title=f"Document {i}",
                content=f"Content for document {i}",
            )
            for i in range(5)
        ]

        # Replace every scan stage with a stub so no ML component is loaded.
        stage_stubs = {
            "_extract_entities": mock.MagicMock(return_value={}),
            "_suggest_tags": mock.MagicMock(return_value=[]),
            "_detect_correspondent": mock.MagicMock(return_value=None),
            "_classify_document_type": mock.MagicMock(return_value=None),
            "_suggest_storage_path": mock.MagicMock(return_value=None),
            "_extract_custom_fields": mock.MagicMock(return_value={}),
            "_suggest_workflows": mock.MagicMock(return_value=[]),
            "_suggest_title": mock.MagicMock(return_value=None),
        }
        with mock.patch.multiple(scanner, **stage_stubs):
            results = [scanner.scan_document(doc, doc.content) for doc in documents]

        # One result per document, each of the expected type.
        self.assertEqual(len(results), 5)
        for scan in results:
            self.assertIsInstance(scan, AIScanResult)
|
||||
|
||||
|
||||
class TestAIScannerIntegrationEntityMatching(TestCase):
    """Match correspondents from extracted organization entities."""

    def setUp(self):
        """Create a document and two correspondents, one named like its content."""
        self.document = Document.objects.create(
            title="Business Invoice",
            content="Invoice from ACME Corporation",
        )

        auto = Correspondent.MATCH_AUTO
        self.correspondent_acme = Correspondent.objects.create(
            name="ACME Corporation",
            matching_algorithm=auto,
        )
        self.correspondent_other = Correspondent.objects.create(
            name="Other Company",
            matching_algorithm=auto,
        )

    def test_correspondent_matching_with_ner_entities(self):
        """An organization entity alone is enough to pick the ACME correspondent."""
        scanner = AIDocumentScanner()

        # Install a mock NER extractor that reports the organization.
        fake_ner = mock.MagicMock()
        fake_ner.extract_all.return_value = {
            "organizations": [{"text": "ACME Corporation"}],
        }
        scanner._ner_extractor = fake_ner

        entities = {"organizations": [{"text": "ACME Corporation"}]}

        # The regular matcher returns nothing, leaving only the entities to go on.
        matcher_patch = mock.patch(
            "documents.ai_scanner.match_correspondents",
            return_value=[],
        )
        with matcher_patch:
            detection = scanner._detect_correspondent(
                self.document,
                self.document.content,
                entities,
            )

        self.assertIsNotNone(detection)
        matched_id, _confidence = detection
        self.assertEqual(matched_id, self.correspondent_acme.id)
|
||||
|
||||
|
||||
class TestAIScannerIntegrationTitleGeneration(TestCase):
    """Check that suggested titles are built from the supplied entities."""

    def test_title_generation_with_entities(self):
        """The suggested title mentions the document type, organization and date."""
        scanner = AIDocumentScanner()

        document = Document.objects.create(
            title="document.pdf",
            content="Invoice from ACME Corp dated 2024-01-15",
        )

        entities = {
            "document_type": "Invoice",
            "organizations": [{"text": "ACME Corp"}],
            "dates": [{"text": "2024-01-15"}],
        }

        title = scanner._suggest_title(document, document.content, entities)

        self.assertIsNotNone(title)
        # Every entity value has to show up somewhere in the title.
        for fragment in ("Invoice", "ACME Corp", "2024-01-15"):
            self.assertIn(fragment, title)
|
||||
|
||||
|
||||
class TestAIScannerIntegrationConfidenceLevels(TestCase):
    """Check how confidence values split results into applied and suggested."""

    def setUp(self):
        """Create a document and one tag per confidence band."""
        self.document = Document.objects.create(
            title="Test",
            content="Test",
        )
        self.tag_high = Tag.objects.create(name="HighConfidence")
        self.tag_medium = Tag.objects.create(name="MediumConfidence")
        self.tag_low = Tag.objects.create(name="LowConfidence")

    def test_confidence_based_application(self):
        """Only the 0.90 tag is applied; the 0.70 tag is merely suggested."""
        scanner = AIDocumentScanner(
            auto_apply_threshold=0.80,
            suggest_threshold=0.60,
        )

        scan_result = AIScanResult()
        scan_result.tags = [
            (self.tag_high.id, 0.90),  # expected to be applied
            (self.tag_medium.id, 0.70),  # expected to be suggested
            (self.tag_low.id, 0.50),  # expected to be ignored
        ]

        outcome = scanner.apply_scan_results(
            self.document,
            scan_result,
            auto_apply=True,
        )

        # Exactly one applied tag: the high-confidence one.
        applied_tags = outcome["applied"]["tags"]
        self.assertEqual(len(applied_tags), 1)
        self.assertEqual(applied_tags[0]["id"], self.tag_high.id)

        # Exactly one suggested tag: the medium-confidence one.
        suggested_tags = outcome["suggestions"]["tags"]
        self.assertEqual(len(suggested_tags), 1)
        self.assertEqual(suggested_tags[0]["id"], self.tag_medium.id)
|
||||
|
||||
|
||||
class TestAIScannerIntegrationGlobalInstance(TestCase):
    """Check the scanner object returned by ``get_ai_scanner``."""

    def test_global_scanner_reusability(self):
        """Two lookups return the same object, and it can scan repeatedly."""
        scanner1 = get_ai_scanner()
        scanner2 = get_ai_scanner()

        # Both names must refer to one and the same object.
        self.assertIs(scanner1, scanner2)

        document = Document.objects.create(
            title="Test",
            content="Test content",
        )

        # Stub out every scan stage on the first scanner for the scans below.
        stage_stubs = {
            "_extract_entities": mock.MagicMock(return_value={}),
            "_suggest_tags": mock.MagicMock(return_value=[]),
            "_detect_correspondent": mock.MagicMock(return_value=None),
            "_classify_document_type": mock.MagicMock(return_value=None),
            "_suggest_storage_path": mock.MagicMock(return_value=None),
            "_extract_custom_fields": mock.MagicMock(return_value={}),
            "_suggest_workflows": mock.MagicMock(return_value=[]),
            "_suggest_title": mock.MagicMock(return_value=None),
        }
        with mock.patch.multiple(scanner1, **stage_stubs):
            result1 = scanner1.scan_document(document, document.content)
            result2 = scanner2.scan_document(document, document.content)

        for scan in (result1, result2):
            self.assertIsInstance(scan, AIScanResult)
|
||||
|
||||
|
||||
class TestAIScannerIntegrationEdgeCases(TestCase):
    """Cover unusual inputs: empty documents, dangling ids and Unicode text."""

    @staticmethod
    def _stub_scan_stages(scanner):
        """Return a context manager that stubs every scan stage on *scanner*."""
        return mock.patch.multiple(
            scanner,
            _extract_entities=mock.MagicMock(return_value={}),
            _suggest_tags=mock.MagicMock(return_value=[]),
            _detect_correspondent=mock.MagicMock(return_value=None),
            _classify_document_type=mock.MagicMock(return_value=None),
            _suggest_storage_path=mock.MagicMock(return_value=None),
            _extract_custom_fields=mock.MagicMock(return_value={}),
            _suggest_workflows=mock.MagicMock(return_value=[]),
            _suggest_title=mock.MagicMock(return_value=None),
        )

    def test_scan_with_minimal_document(self):
        """A document with empty title and content still yields a scan result."""
        scanner = AIDocumentScanner()

        document = Document.objects.create(
            title="",
            content="",
        )

        with self._stub_scan_stages(scanner):
            result = scanner.scan_document(document, document.content)

        self.assertIsInstance(result, AIScanResult)

    def test_apply_with_deleted_references(self):
        """Applying ids that match no stored object applies no tags."""
        scanner = AIDocumentScanner()

        document = Document.objects.create(
            title="Test",
            content="Test",
        )

        missing_id = 9999  # no tag or correspondent is created in this test
        scan_result = AIScanResult()
        scan_result.tags = [(missing_id, 0.90)]
        scan_result.correspondent = (missing_id, 0.90)

        # The call itself must succeed despite the dangling references.
        outcome = scanner.apply_scan_results(
            document,
            scan_result,
            auto_apply=True,
        )

        self.assertEqual(len(outcome["applied"]["tags"]), 0)

    def test_scan_with_unicode_and_special_characters(self):
        """Non-ASCII titles and content scan without error."""
        scanner = AIDocumentScanner()

        document = Document.objects.create(
            title="Factura - España 🇪🇸",
            content="Société française • 日本語 • Ελληνικά • مرحبا",
        )

        with self._stub_scan_stages(scanner):
            result = scanner.scan_document(document, document.content)

        self.assertIsInstance(result, AIScanResult)
|
||||
Loading…
Add table
Add a link
Reference in a new issue