Add comprehensive unit and integration tests for AI Scanner

Co-authored-by: dawnsystem <42047891+dawnsystem@users.noreply.github.com>
This commit is contained in:
copilot-swe-agent[bot] 2025-11-12 10:05:58 +00:00
parent 31d8fdc0d1
commit f63c7faa92
2 changed files with 1957 additions and 0 deletions

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,540 @@
"""
Integration tests for AI Scanner Module
These tests verify the AI scanner works correctly with real database
operations and model interactions, testing the full workflow from
document consumption to metadata application.
"""
from unittest import mock
from django.test import TestCase, TransactionTestCase
from documents.ai_scanner import (
AIDocumentScanner,
AIScanResult,
get_ai_scanner,
)
from documents.models import (
Correspondent,
CustomField,
Document,
DocumentType,
StoragePath,
Tag,
Workflow,
WorkflowTrigger,
WorkflowAction,
)
class TestAIScannerIntegrationBasic(TestCase):
    """Run the scanner against real database rows with the matchers stubbed."""

    def setUp(self):
        """Create a document, two tags, a correspondent, a type and a storage path."""
        self.document = Document.objects.create(
            title="Invoice from ACME Corporation",
            content="Invoice #12345 from ACME Corporation dated 2024-01-01. Total: $1,000",
        )

        self.tag_invoice = Tag.objects.create(
            name="Invoice",
            matching_algorithm=Tag.MATCH_AUTO,
            match="invoice",
        )
        self.tag_important = Tag.objects.create(
            name="Important",
            matching_algorithm=Tag.MATCH_AUTO,
            match="total",
        )

        self.correspondent = Correspondent.objects.create(
            name="ACME Corporation",
            matching_algorithm=Correspondent.MATCH_AUTO,
            match="acme",
        )

        self.doc_type = DocumentType.objects.create(
            name="Invoice",
            matching_algorithm=DocumentType.MATCH_AUTO,
            match="invoice",
        )

        self.storage_path = StoragePath.objects.create(
            name="Invoices",
            path="/invoices",
            matching_algorithm=StoragePath.MATCH_AUTO,
            match="invoice",
        )

    def test_full_scan_and_apply_workflow(self):
        """Scan the document, auto-apply the result and check the stored row."""
        # Every matcher is replaced so the scan sees exactly the setUp fixtures.
        with mock.patch(
            'documents.ai_scanner.match_tags',
            return_value=[self.tag_invoice, self.tag_important],
        ), mock.patch(
            'documents.ai_scanner.match_correspondents',
            return_value=[self.correspondent],
        ), mock.patch(
            'documents.ai_scanner.match_document_types',
            return_value=[self.doc_type],
        ), mock.patch(
            'documents.ai_scanner.match_storage_paths',
            return_value=[self.storage_path],
        ):
            scanner = AIDocumentScanner(auto_apply_threshold=0.80)

            # Step 1: scan.
            scan = scanner.scan_document(self.document, self.document.content)

            self.assertIsNotNone(scan)
            self.assertGreater(len(scan.tags), 0)
            self.assertIsNotNone(scan.correspondent)
            self.assertIsNotNone(scan.document_type)
            self.assertIsNotNone(scan.storage_path)

            # Step 2: apply what was found.
            outcome = scanner.apply_scan_results(
                self.document,
                scan,
                auto_apply=True,
            )

            applied = outcome["applied"]
            self.assertGreater(len(applied["tags"]), 0)
            self.assertIsNotNone(applied["correspondent"])

            # Step 3: the stored row must carry the applied metadata.
            self.document.refresh_from_db()
            self.assertEqual(self.document.correspondent, self.correspondent)
            self.assertEqual(self.document.document_type, self.doc_type)
            self.assertEqual(self.document.storage_path, self.storage_path)

    def test_scan_with_no_matches(self):
        """A scan over unrelated text yields no tags, correspondent or type."""
        with mock.patch('documents.ai_scanner.match_tags', return_value=[]):
            scan = AIDocumentScanner().scan_document(
                self.document,
                "Some random text with no matches",
            )

            self.assertEqual(len(scan.tags), 0)
            self.assertIsNone(scan.correspondent)
            self.assertIsNone(scan.document_type)
class TestAIScannerIntegrationCustomFields(TestCase):
    """Check that scanning fills custom fields from extracted entities."""

    def setUp(self):
        """Create an invoice document and four custom fields (one date, three strings)."""
        self.document = Document.objects.create(
            title="Invoice",
            content="Invoice #INV-123 dated 2024-01-01. Amount: $1,500. Contact: john@example.com",
        )

        string_type = CustomField.FieldDataType.STRING
        self.field_date = CustomField.objects.create(
            name="Invoice Date",
            data_type=CustomField.FieldDataType.DATE,
        )
        self.field_number = CustomField.objects.create(
            name="Invoice Number",
            data_type=string_type,
        )
        self.field_amount = CustomField.objects.create(
            name="Total Amount",
            data_type=string_type,
        )
        self.field_email = CustomField.objects.create(
            name="Contact Email",
            data_type=string_type,
        )

    def test_custom_field_extraction_integration(self):
        """With a stubbed NER extractor, the date and amount fields get values."""
        canned_entities = {
            "dates": [{"text": "2024-01-01"}],
            "amounts": [{"text": "$1,500"}],
            "invoice_numbers": ["INV-123"],
            "emails": ["john@example.com"],
        }
        scanner = AIDocumentScanner()
        # Swap in a stand-in extractor so no real NER model is needed.
        scanner._ner_extractor = mock.MagicMock(
            **{"extract_all.return_value": canned_entities}
        )

        scan = scanner.scan_document(self.document, self.document.content)

        # At least one field was populated ...
        self.assertGreater(len(scan.custom_fields), 0)

        # ... and the date and amount fields are among them.
        field_ids = [*scan.custom_fields.keys()]
        self.assertIn(self.field_date.id, field_ids)
        self.assertIn(self.field_amount.id, field_ids)
class TestAIScannerIntegrationWorkflows(TestCase):
    """Check workflow suggestions against workflows stored in the database."""

    def setUp(self):
        """Create a document and two enabled workflows with consumption triggers."""
        consumption = WorkflowTrigger.WorkflowTriggerType.CONSUMPTION

        self.document = Document.objects.create(
            title="Invoice",
            content="Invoice document",
        )

        # First workflow: trigger plus an assignment action.
        self.workflow1 = Workflow.objects.create(
            name="Invoice Processing",
            enabled=True,
        )
        self.trigger1 = WorkflowTrigger.objects.create(
            workflow=self.workflow1,
            type=consumption,
        )
        self.action1 = WorkflowAction.objects.create(
            workflow=self.workflow1,
            type=WorkflowAction.WorkflowActionType.ASSIGNMENT,
        )

        # Second workflow: trigger only.
        self.workflow2 = Workflow.objects.create(
            name="Archive Documents",
            enabled=True,
        )
        self.trigger2 = WorkflowTrigger.objects.create(
            workflow=self.workflow2,
            type=consumption,
        )

    def test_workflow_suggestion_integration(self):
        """The first workflow is among the suggestions for a typed, tagged scan."""
        scanner = AIDocumentScanner(suggest_threshold=0.5)

        # Hand-built scan result carrying a document type and one tag.
        partial_scan = AIScanResult()
        partial_scan.document_type = (1, 0.85)
        partial_scan.tags = [(1, 0.80)]

        suggestions = scanner._suggest_workflows(
            self.document,
            self.document.content,
            partial_scan,
        )

        self.assertGreater(len(suggestions), 0)
        suggested_ids = [workflow_id for workflow_id, _score in suggestions]
        self.assertIn(self.workflow1.id, suggested_ids)
class TestAIScannerIntegrationTransactions(TransactionTestCase):
    """Check that a failed apply leaves no partial changes in the database.

    ``TransactionTestCase`` is used so that the code under test runs with
    real transaction semantics instead of inside the per-test transaction
    that ``TestCase`` wraps around each test.
    """

    def setUp(self):
        """Create a bare document plus one tag and one correspondent to apply."""
        self.document = Document.objects.create(
            title="Test Document",
            content="Test content",
        )
        self.tag = Tag.objects.create(name="TestTag")
        self.correspondent = Correspondent.objects.create(name="TestCorp")

    def test_transaction_rollback_on_error(self):
        """A failing ``Document.save`` must not leave applied metadata behind."""
        scanner = AIDocumentScanner()
        scan_result = AIScanResult()
        scan_result.tags = [(self.tag.id, 0.90)]
        scan_result.correspondent = (self.correspondent.id, 0.90)

        def failing_save(instance, *args, **kwargs):
            """Stand-in for ``Document.save`` that always fails."""
            raise Exception("Forced save failure")

        with mock.patch.object(Document, 'save', failing_save):
            try:
                scanner.apply_scan_results(
                    self.document,
                    scan_result,
                    auto_apply=True,
                )
            except Exception:
                # Deliberately tolerated: whether the scanner swallows or
                # re-raises the save failure is not what this test checks.
                # Only the persisted state below matters.
                pass

        # Reload from the database so in-memory attribute changes made before
        # the failed save do not mask what was actually persisted.
        self.document.refresh_from_db()

        # Neither the correspondent nor the tag may have been persisted.
        # NOTE(review): assumes the tag relation on Document is named ``tags``.
        self.assertIsNone(self.document.correspondent)
        self.assertEqual(self.document.tags.count(), 0)
class TestAIScannerIntegrationPerformance(TestCase):
    """Check that one scanner instance can process several documents in a row."""

    def test_scan_multiple_documents(self):
        """Five documents scanned in sequence each yield an ``AIScanResult``."""
        scanner = AIDocumentScanner()

        batch = [
            Document.objects.create(
                title=f"Document {index}",
                content=f"Content for document {index}",
            )
            for index in range(5)
        ]

        # Replace every pipeline stage so no ML component has to be loaded.
        stage_stubs = {
            "_extract_entities": mock.MagicMock(return_value={}),
            "_suggest_tags": mock.MagicMock(return_value=[]),
            "_detect_correspondent": mock.MagicMock(return_value=None),
            "_classify_document_type": mock.MagicMock(return_value=None),
            "_suggest_storage_path": mock.MagicMock(return_value=None),
            "_extract_custom_fields": mock.MagicMock(return_value={}),
            "_suggest_workflows": mock.MagicMock(return_value=[]),
            "_suggest_title": mock.MagicMock(return_value=None),
        }
        with mock.patch.multiple(scanner, **stage_stubs):
            outcomes = [
                scanner.scan_document(item, item.content) for item in batch
            ]

            # One result per document, each of the expected type.
            self.assertEqual(len(outcomes), 5)
            for outcome in outcomes:
                self.assertIsInstance(outcome, AIScanResult)
class TestAIScannerIntegrationEntityMatching(TestCase):
    """Check correspondent detection driven by extracted organization names."""

    def setUp(self):
        """Create a document and two correspondents, only one named in its text."""
        self.document = Document.objects.create(
            title="Business Invoice",
            content="Invoice from ACME Corporation",
        )

        auto = Correspondent.MATCH_AUTO
        self.correspondent_acme = Correspondent.objects.create(
            name="ACME Corporation",
            matching_algorithm=auto,
        )
        self.correspondent_other = Correspondent.objects.create(
            name="Other Company",
            matching_algorithm=auto,
        )

    def test_correspondent_matching_with_ner_entities(self):
        """With the matcher returning nothing, the organization entity picks ACME."""
        organizations = {"organizations": [{"text": "ACME Corporation"}]}

        scanner = AIDocumentScanner()
        # Stand-in extractor reporting a single organization.
        scanner._ner_extractor = mock.MagicMock(
            **{"extract_all.return_value": organizations}
        )

        # The regular matcher returns no hits, leaving only the entities to go on.
        with mock.patch(
            'documents.ai_scanner.match_correspondents',
            return_value=[],
        ):
            detected = scanner._detect_correspondent(
                self.document,
                self.document.content,
                {"organizations": [{"text": "ACME Corporation"}]},
            )

            self.assertIsNotNone(detected)
            matched_id, _confidence = detected
            self.assertEqual(matched_id, self.correspondent_acme.id)
class TestAIScannerIntegrationTitleGeneration(TestCase):
    """Check that suggested titles are built from extracted entities."""

    def test_title_generation_with_entities(self):
        """The suggested title mentions the type, organization and date."""
        document = Document.objects.create(
            title="document.pdf",
            content="Invoice from ACME Corp dated 2024-01-15",
        )
        entities = {
            "document_type": "Invoice",
            "organizations": [{"text": "ACME Corp"}],
            "dates": [{"text": "2024-01-15"}],
        }

        suggested = AIDocumentScanner()._suggest_title(
            document,
            document.content,
            entities,
        )

        self.assertIsNotNone(suggested)
        # Each entity value must appear verbatim in the title.
        for fragment in ("Invoice", "ACME Corp", "2024-01-15"):
            self.assertIn(fragment, suggested)
class TestAIScannerIntegrationConfidenceLevels(TestCase):
    """Check how confidence scores split results into applied and suggested."""

    def setUp(self):
        """Create a document and three tags, one per confidence band."""
        self.document = Document.objects.create(title="Test", content="Test")
        self.tag_high = Tag.objects.create(name="HighConfidence")
        self.tag_medium = Tag.objects.create(name="MediumConfidence")
        self.tag_low = Tag.objects.create(name="LowConfidence")

    def test_confidence_based_application(self):
        """Only the 0.90 tag is applied; the 0.70 tag is merely suggested."""
        scanner = AIDocumentScanner(
            auto_apply_threshold=0.80,
            suggest_threshold=0.60,
        )

        scored = AIScanResult()
        scored.tags = [
            (self.tag_high.id, 0.90),    # above the auto-apply threshold
            (self.tag_medium.id, 0.70),  # between the two thresholds
            (self.tag_low.id, 0.50),     # below the suggest threshold
        ]

        outcome = scanner.apply_scan_results(
            self.document,
            scored,
            auto_apply=True,
        )

        applied_tags = outcome["applied"]["tags"]
        self.assertEqual(len(applied_tags), 1)
        self.assertEqual(applied_tags[0]["id"], self.tag_high.id)

        suggested_tags = outcome["suggestions"]["tags"]
        self.assertEqual(len(suggested_tags), 1)
        self.assertEqual(suggested_tags[0]["id"], self.tag_medium.id)
class TestAIScannerIntegrationGlobalInstance(TestCase):
    """Check the scanner object handed out by ``get_ai_scanner``."""

    def test_global_scanner_reusability(self):
        """Two lookups return one object, and it can scan more than once."""
        first = get_ai_scanner()
        second = get_ai_scanner()
        self.assertIs(first, second)

        document = Document.objects.create(
            title="Test",
            content="Test content",
        )

        # Replace every pipeline stage on the shared instance for the scans below.
        stage_stubs = {
            "_extract_entities": mock.MagicMock(return_value={}),
            "_suggest_tags": mock.MagicMock(return_value=[]),
            "_detect_correspondent": mock.MagicMock(return_value=None),
            "_classify_document_type": mock.MagicMock(return_value=None),
            "_suggest_storage_path": mock.MagicMock(return_value=None),
            "_extract_custom_fields": mock.MagicMock(return_value={}),
            "_suggest_workflows": mock.MagicMock(return_value=[]),
            "_suggest_title": mock.MagicMock(return_value=None),
        }
        with mock.patch.multiple(first, **stage_stubs):
            via_first = first.scan_document(document, document.content)
            via_second = second.scan_document(document, document.content)

            self.assertIsInstance(via_first, AIScanResult)
            self.assertIsInstance(via_second, AIScanResult)
class TestAIScannerIntegrationEdgeCases(TestCase):
    """Check scanner behaviour on unusual documents and stale references."""

    @staticmethod
    def _scan_with_stubbed_stages(scanner, document):
        """Scan ``document`` with every pipeline stage of ``scanner`` replaced."""
        stage_stubs = {
            "_extract_entities": mock.MagicMock(return_value={}),
            "_suggest_tags": mock.MagicMock(return_value=[]),
            "_detect_correspondent": mock.MagicMock(return_value=None),
            "_classify_document_type": mock.MagicMock(return_value=None),
            "_suggest_storage_path": mock.MagicMock(return_value=None),
            "_extract_custom_fields": mock.MagicMock(return_value={}),
            "_suggest_workflows": mock.MagicMock(return_value=[]),
            "_suggest_title": mock.MagicMock(return_value=None),
        }
        with mock.patch.multiple(scanner, **stage_stubs):
            return scanner.scan_document(document, document.content)

    def test_scan_with_minimal_document(self):
        """A document with empty title and content still yields a scan result."""
        blank = Document.objects.create(title="", content="")

        outcome = self._scan_with_stubbed_stages(AIDocumentScanner(), blank)

        self.assertIsInstance(outcome, AIScanResult)

    def test_apply_with_deleted_references(self):
        """Applying ids that match no row applies no tags and does not raise."""
        scanner = AIDocumentScanner()
        document = Document.objects.create(title="Test", content="Test")

        stale = AIScanResult()
        stale.tags = [(9999, 0.90)]          # no tag has this id
        stale.correspondent = (9999, 0.90)   # no correspondent has this id

        outcome = scanner.apply_scan_results(
            document,
            stale,
            auto_apply=True,
        )

        self.assertEqual(len(outcome["applied"]["tags"]), 0)

    def test_scan_with_unicode_and_special_characters(self):
        """Non-ASCII titles and content do not break a scan."""
        multilingual = Document.objects.create(
            title="Factura - España 🇪🇸",
            content="Société française • 日本語 • Ελληνικά • مرحبا",
        )

        outcome = self._scan_with_stubbed_stages(
            AIDocumentScanner(),
            multilingual,
        )

        self.assertIsInstance(outcome, AIScanResult)