# paperless-ngx/src/documents/ai_scanner.py
"""
AI Scanner Module for IntelliDocs-ngx
This module provides comprehensive AI-powered document scanning and metadata management.
It automatically analyzes documents on upload/consumption and manages:
- Tags
- Correspondents
- Document Types
- Storage Paths
- Custom Fields
- Workflow Assignments
According to agents.md requirements:
- AI scans every consumed/uploaded document
- AI suggests metadata for all manageable aspects
- AI cannot delete files without explicit user authorization
- AI must inform users comprehensively before any destructive action
"""
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Dict, List, Optional, Any, Tuple
from django.conf import settings
from django.db import transaction
if TYPE_CHECKING:
from documents.models import (
Document,
Tag,
Correspondent,
DocumentType,
StoragePath,
CustomField,
Workflow,
)
logger = logging.getLogger("paperless.ai_scanner")
class AIScanResult:
"""
Container for AI scan results with confidence scores and suggestions.
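
    Example shape (illustrative values):
        result = AIScanResult()
        result.tags = [(3, 0.91), (7, 0.64)]  # [(tag_id, confidence), ...]
        result.correspondent = (12, 0.85)     # (correspondent_id, confidence)
        result.to_dict()["tags"]              # -> [(3, 0.91), (7, 0.64)]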
"""
def __init__(self):
self.tags: List[Tuple[int, float]] = [] # [(tag_id, confidence), ...]
self.correspondent: Optional[Tuple[int, float]] = None # (correspondent_id, confidence)
self.document_type: Optional[Tuple[int, float]] = None # (document_type_id, confidence)
self.storage_path: Optional[Tuple[int, float]] = None # (storage_path_id, confidence)
self.custom_fields: Dict[int, Tuple[Any, float]] = {} # {field_id: (value, confidence), ...}
self.workflows: List[Tuple[int, float]] = [] # [(workflow_id, confidence), ...]
self.extracted_entities: Dict[str, Any] = {} # NER results
self.title_suggestion: Optional[str] = None
self.metadata: Dict[str, Any] = {} # Additional metadata
def to_dict(self) -> Dict[str, Any]:
"""Convert scan results to dictionary for logging/serialization."""
return {
"tags": self.tags,
"correspondent": self.correspondent,
"document_type": self.document_type,
"storage_path": self.storage_path,
"custom_fields": self.custom_fields,
"workflows": self.workflows,
"extracted_entities": self.extracted_entities,
"title_suggestion": self.title_suggestion,
"metadata": self.metadata,
}
class AIDocumentScanner:
"""
Comprehensive AI scanner for automatic document metadata management.
This scanner integrates all ML/AI capabilities to provide automatic:
- Tag assignment based on content analysis
- Correspondent detection from document text
- Document type classification
- Storage path suggestion based on content/type
- Custom field extraction using NER
- Workflow assignment based on document characteristics
Features:
    - High confidence (>= 80%): suggestions are applied automatically
    - Medium confidence (60-80%): suggestions require user review
    - Low confidence (< 60%): results are logged but not suggested
- All decisions are logged for auditing
- No destructive operations without user confirmation
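
    Example (illustrative; thresholds shown are the constructor defaults):
        scanner = AIDocumentScanner(auto_apply_threshold=0.80)
        result = scanner.scan_document(document, document.content)
        outcome = scanner.apply_scan_results(document, result)
        # outcome["applied"] holds auto-applied metadata,
        # outcome["suggestions"] holds items awaiting user review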
"""
def __init__(
self,
auto_apply_threshold: float = 0.80,
suggest_threshold: float = 0.60,
        enable_ml_features: Optional[bool] = None,
        enable_advanced_ocr: Optional[bool] = None,
):
"""
Initialize AI scanner.
Args:
auto_apply_threshold: Confidence threshold for automatic application (default: 0.80)
suggest_threshold: Confidence threshold for suggestions (default: 0.60)
enable_ml_features: Override for ML features (uses settings if None)
enable_advanced_ocr: Override for advanced OCR (uses settings if None)
"""
self.auto_apply_threshold = auto_apply_threshold
self.suggest_threshold = suggest_threshold
# Check settings for ML/OCR enablement
self.ml_enabled = (
enable_ml_features
if enable_ml_features is not None
else getattr(settings, "PAPERLESS_ENABLE_ML_FEATURES", True)
)
self.advanced_ocr_enabled = (
enable_advanced_ocr
if enable_advanced_ocr is not None
else getattr(settings, "PAPERLESS_ENABLE_ADVANCED_OCR", True)
)
# Lazy loading of ML components
self._classifier = None
self._ner_extractor = None
self._semantic_search = None
self._table_extractor = None
logger.info(
f"AIDocumentScanner initialized - ML: {self.ml_enabled}, "
f"Advanced OCR: {self.advanced_ocr_enabled}"
)
def _get_classifier(self):
"""Lazy load the ML classifier."""
if self._classifier is None and self.ml_enabled:
try:
from documents.ml.classifier import TransformerDocumentClassifier
self._classifier = TransformerDocumentClassifier()
logger.info("ML classifier loaded successfully")
except Exception as e:
logger.warning(f"Failed to load ML classifier: {e}")
self.ml_enabled = False
return self._classifier
def _get_ner_extractor(self):
"""Lazy load the NER extractor."""
if self._ner_extractor is None and self.ml_enabled:
try:
from documents.ml.ner import DocumentNER
self._ner_extractor = DocumentNER()
logger.info("NER extractor loaded successfully")
except Exception as e:
logger.warning(f"Failed to load NER extractor: {e}")
return self._ner_extractor
def _get_semantic_search(self):
"""Lazy load semantic search."""
if self._semantic_search is None and self.ml_enabled:
try:
from documents.ml.semantic_search import SemanticSearch
self._semantic_search = SemanticSearch()
logger.info("Semantic search loaded successfully")
except Exception as e:
logger.warning(f"Failed to load semantic search: {e}")
return self._semantic_search
def _get_table_extractor(self):
"""Lazy load table extractor."""
if self._table_extractor is None and self.advanced_ocr_enabled:
try:
from documents.ocr.table_extractor import TableExtractor
self._table_extractor = TableExtractor()
logger.info("Table extractor loaded successfully")
except Exception as e:
logger.warning(f"Failed to load table extractor: {e}")
return self._table_extractor
def scan_document(
self,
document: Document,
document_text: str,
        original_file_path: Optional[str] = None,
) -> AIScanResult:
"""
Perform comprehensive AI scan of a document.
This is the main entry point for document scanning. It orchestrates
all AI/ML components to analyze the document and generate suggestions.
Args:
document: The Document model instance
document_text: The extracted text content
original_file_path: Path to original file (for OCR/image analysis)
Returns:
AIScanResult containing all suggestions and extracted data
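
        Example (illustrative):
            result = scanner.scan_document(document, document.content)
            result.tags              # e.g. [(3, 0.85)]
            result.title_suggestion  # e.g. "Invoice - ACME GmbH - 2024-03-01"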
"""
logger.info(f"Starting AI scan for document: {document.title} (ID: {document.pk})")
result = AIScanResult()
# Extract entities using NER
result.extracted_entities = self._extract_entities(document_text)
# Analyze and suggest tags
result.tags = self._suggest_tags(document, document_text, result.extracted_entities)
# Detect correspondent
result.correspondent = self._detect_correspondent(
document, document_text, result.extracted_entities
)
# Classify document type
result.document_type = self._classify_document_type(
document, document_text, result.extracted_entities
)
# Suggest storage path
result.storage_path = self._suggest_storage_path(
document, document_text, result
)
# Extract custom fields
result.custom_fields = self._extract_custom_fields(
document, document_text, result.extracted_entities
)
# Suggest workflows
result.workflows = self._suggest_workflows(document, document_text, result)
# Generate improved title suggestion
result.title_suggestion = self._suggest_title(
document, document_text, result.extracted_entities
)
# Extract tables if advanced OCR enabled
if self.advanced_ocr_enabled and original_file_path:
result.metadata["tables"] = self._extract_tables(original_file_path)
logger.info(f"AI scan completed for document {document.pk}")
logger.debug(f"Scan results: {result.to_dict()}")
return result
def _extract_entities(self, text: str) -> Dict[str, Any]:
"""
Extract named entities from document text using NER.
Returns:
Dictionary with extracted entities (persons, orgs, dates, amounts, etc.)
"""
ner = self._get_ner_extractor()
if not ner:
return {}
try:
# Use extract_all to get comprehensive entity extraction
entities = ner.extract_all(text)
# Convert string lists to dict format for consistency
for key in ["persons", "organizations", "locations", "misc"]:
if key in entities and isinstance(entities[key], list):
entities[key] = [{"text": e} if isinstance(e, str) else e for e in entities[key]]
for key in ["dates", "amounts"]:
if key in entities and isinstance(entities[key], list):
entities[key] = [{"text": e} if isinstance(e, str) else e for e in entities[key]]
logger.debug(f"Extracted entities from NER")
return entities
except Exception as e:
logger.error(f"Entity extraction failed: {e}", exc_info=True)
return {}
def _suggest_tags(
self,
document: Document,
text: str,
entities: Dict[str, Any],
) -> List[Tuple[int, float]]:
"""
Suggest relevant tags based on document content and entities.
Uses a combination of:
- Keyword matching with existing tag patterns
- ML classification if available
- Entity-based suggestions (e.g., organization -> company tag)
Returns:
List of (tag_id, confidence) tuples
"""
from documents.models import Tag
from documents.matching import match_tags
suggestions = []
try:
# Use existing matching logic
matched_tags = match_tags(document, self._get_classifier())
# Add confidence scores based on matching strength
for tag in matched_tags:
confidence = 0.85 # High confidence for matched tags
suggestions.append((tag.id, confidence))
# Additional entity-based suggestions
if entities:
# Suggest tags based on detected entities
all_tags = Tag.objects.all()
# Check for organization entities -> company/business tags
if entities.get("organizations"):
for tag in all_tags.filter(name__icontains="company"):
suggestions.append((tag.id, 0.70))
                # Date entities present -> weakly suggest tax/financial tags
if entities.get("dates"):
for tag in all_tags.filter(name__icontains="tax"):
suggestions.append((tag.id, 0.65))
# Remove duplicates, keep highest confidence
seen = {}
for tag_id, conf in suggestions:
if tag_id not in seen or conf > seen[tag_id]:
seen[tag_id] = conf
            suggestions = sorted(seen.items(), key=lambda x: x[1], reverse=True)
logger.debug(f"Suggested {len(suggestions)} tags")
except Exception as e:
logger.error(f"Tag suggestion failed: {e}", exc_info=True)
return suggestions
def _detect_correspondent(
self,
document: Document,
text: str,
entities: Dict[str, Any],
) -> Optional[Tuple[int, float]]:
"""
Detect correspondent based on document content and entities.
Uses:
- Organization entities from NER
- Email domains
- Existing correspondent matching patterns
Returns:
(correspondent_id, confidence) or None
"""
from documents.models import Correspondent
from documents.matching import match_correspondents
try:
# Use existing matching logic
matched_correspondents = match_correspondents(document, self._get_classifier())
if matched_correspondents:
correspondent = matched_correspondents[0]
confidence = 0.85
logger.debug(
f"Detected correspondent: {correspondent.name} "
f"(confidence: {confidence})"
)
return (correspondent.id, confidence)
# Try to match based on NER organizations
if entities.get("organizations"):
org_name = entities["organizations"][0]["text"]
# Try to find existing correspondent with similar name
correspondents = Correspondent.objects.filter(
name__icontains=org_name[:20] # First 20 chars
)
if correspondents.exists():
correspondent = correspondents.first()
confidence = 0.70
logger.debug(
f"Detected correspondent from NER: {correspondent.name} "
f"(confidence: {confidence})"
)
return (correspondent.id, confidence)
except Exception as e:
logger.error(f"Correspondent detection failed: {e}", exc_info=True)
return None
def _classify_document_type(
self,
document: Document,
text: str,
entities: Dict[str, Any],
) -> Optional[Tuple[int, float]]:
"""
Classify document type using ML and content analysis.
Returns:
(document_type_id, confidence) or None
"""
from documents.models import DocumentType
from documents.matching import match_document_types
try:
# Use existing matching logic
matched_types = match_document_types(document, self._get_classifier())
if matched_types:
doc_type = matched_types[0]
confidence = 0.85
logger.debug(
f"Classified document type: {doc_type.name} "
f"(confidence: {confidence})"
)
return (doc_type.id, confidence)
# ML-based classification if available
classifier = self._get_classifier()
if classifier and hasattr(classifier, "predict"):
# This would need a trained model with document type labels
# For now, fall back to pattern matching
pass
except Exception as e:
logger.error(f"Document type classification failed: {e}", exc_info=True)
return None
def _suggest_storage_path(
self,
document: Document,
text: str,
scan_result: AIScanResult,
) -> Optional[Tuple[int, float]]:
"""
Suggest appropriate storage path based on document characteristics.
Returns:
(storage_path_id, confidence) or None
"""
from documents.models import StoragePath
from documents.matching import match_storage_paths
try:
# Use existing matching logic
matched_paths = match_storage_paths(document, self._get_classifier())
if matched_paths:
storage_path = matched_paths[0]
confidence = 0.80
logger.debug(
f"Suggested storage path: {storage_path.name} "
f"(confidence: {confidence})"
)
return (storage_path.id, confidence)
except Exception as e:
logger.error(f"Storage path suggestion failed: {e}", exc_info=True)
return None
def _extract_custom_fields(
self,
document: Document,
text: str,
entities: Dict[str, Any],
) -> Dict[int, Tuple[Any, float]]:
"""
Extract values for custom fields using NER and pattern matching.
Returns:
Dictionary mapping field_id to (value, confidence)
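
        Example return value (illustrative):
            {4: ("INV-2024-001", 0.80), 9: ("2024-03-01", 0.75)}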
"""
from documents.models import CustomField
extracted_fields = {}
try:
custom_fields = CustomField.objects.all()
for field in custom_fields:
# Try to extract field value based on field name and type
value, confidence = self._extract_field_value(
field, text, entities
)
if value is not None and confidence >= self.suggest_threshold:
extracted_fields[field.id] = (value, confidence)
logger.debug(
f"Extracted custom field '{field.name}': {value} "
f"(confidence: {confidence})"
)
except Exception as e:
logger.error(f"Custom field extraction failed: {e}", exc_info=True)
return extracted_fields
def _extract_field_value(
self,
field: CustomField,
text: str,
entities: Dict[str, Any],
) -> Tuple[Any, float]:
"""
Extract a single custom field value.
Returns:
(value, confidence) tuple
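
        Example (illustrative): a field named "Invoice Number" matches the
        "invoice" branch below and might return ("INV-2024-001", 0.80).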
"""
field_name_lower = field.name.lower()
# Date fields
if "date" in field_name_lower:
dates = entities.get("dates", [])
if dates:
return (dates[0]["text"], 0.75)
# Amount/price fields
if any(keyword in field_name_lower for keyword in ["amount", "price", "cost", "total"]):
amounts = entities.get("amounts", [])
if amounts:
return (amounts[0]["text"], 0.75)
# Invoice number fields
if "invoice" in field_name_lower:
invoice_numbers = entities.get("invoice_numbers", [])
if invoice_numbers:
return (invoice_numbers[0], 0.80)
# Email fields
if "email" in field_name_lower:
emails = entities.get("emails", [])
if emails:
return (emails[0], 0.85)
# Phone fields
if "phone" in field_name_lower:
phones = entities.get("phones", [])
if phones:
return (phones[0], 0.85)
# Person name fields
if "name" in field_name_lower or "person" in field_name_lower:
persons = entities.get("persons", [])
if persons:
return (persons[0]["text"], 0.70)
# Organization fields
if "company" in field_name_lower or "organization" in field_name_lower:
orgs = entities.get("organizations", [])
if orgs:
return (orgs[0]["text"], 0.70)
return (None, 0.0)
def _suggest_workflows(
self,
document: Document,
text: str,
scan_result: AIScanResult,
) -> List[Tuple[int, float]]:
"""
Suggest relevant workflows based on document characteristics.
Returns:
List of (workflow_id, confidence) tuples
"""
from documents.models import Workflow, WorkflowTrigger
suggestions = []
try:
# Get all workflows with consumption triggers
workflows = Workflow.objects.filter(
enabled=True,
triggers__type=WorkflowTrigger.WorkflowTriggerType.CONSUMPTION,
).distinct()
for workflow in workflows:
# Evaluate workflow conditions against scan results
confidence = self._evaluate_workflow_match(
workflow, document, scan_result
)
if confidence >= self.suggest_threshold:
suggestions.append((workflow.id, confidence))
logger.debug(
f"Suggested workflow: {workflow.name} "
f"(confidence: {confidence})"
)
except Exception as e:
logger.error(f"Workflow suggestion failed: {e}", exc_info=True)
return suggestions
def _evaluate_workflow_match(
self,
workflow: Workflow,
document: Document,
scan_result: AIScanResult,
) -> float:
"""
Evaluate how well a workflow matches the document.
Returns:
Confidence score (0.0 to 1.0)
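
        Example (illustrative): a document with a detected type, a
        correspondent, and at least one suggested tag scores
        0.5 + 0.2 + 0.15 + 0.15 = 1.0 (capped at 1.0).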
"""
# This is a simplified evaluation
# In practice, you'd check workflow triggers and conditions
confidence = 0.5 # Base confidence
# Increase confidence if document type matches workflow expectations
if scan_result.document_type and workflow.actions.exists():
confidence += 0.2
# Increase confidence if correspondent matches
if scan_result.correspondent:
confidence += 0.15
# Increase confidence if tags match
if scan_result.tags:
confidence += 0.15
return min(confidence, 1.0)
def _suggest_title(
self,
document: Document,
text: str,
entities: Dict[str, Any],
) -> Optional[str]:
"""
Generate an improved title suggestion based on document content.
Returns:
Suggested title or None
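
        Example (illustrative): entities yielding the parts ["Invoice",
        "ACME GmbH", "2024-03-01"] produce "Invoice - ACME GmbH - 2024-03-01".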
"""
try:
# Extract key information for title
title_parts = []
# Add document type if detected
if entities.get("document_type"):
title_parts.append(entities["document_type"])
# Add primary organization
orgs = entities.get("organizations", [])
if orgs:
title_parts.append(orgs[0]["text"][:30]) # Limit length
# Add date if available
dates = entities.get("dates", [])
if dates:
title_parts.append(dates[0]["text"])
if title_parts:
suggested_title = " - ".join(title_parts)
logger.debug(f"Generated title suggestion: {suggested_title}")
return suggested_title[:127] # Respect title length limit
except Exception as e:
logger.error(f"Title suggestion failed: {e}", exc_info=True)
return None
def _extract_tables(self, file_path: str) -> List[Dict[str, Any]]:
"""
Extract tables from document using advanced OCR.
Returns:
List of extracted tables with data and metadata
"""
extractor = self._get_table_extractor()
if not extractor:
return []
try:
tables = extractor.extract_tables_from_image(file_path)
logger.debug(f"Extracted {len(tables)} tables from document")
return tables
except Exception as e:
logger.error(f"Table extraction failed: {e}", exc_info=True)
return []
def apply_scan_results(
self,
document: Document,
scan_result: AIScanResult,
auto_apply: bool = True,
user_confirmed: bool = False,
) -> Dict[str, Any]:
"""
Apply AI scan results to document.
Args:
document: Document to update
scan_result: AI scan results
auto_apply: Whether to auto-apply high confidence suggestions
            user_confirmed: Whether the user has confirmed low-confidence changes
                (currently accepted but not yet used by this method)
Returns:
Dictionary with applied changes and pending suggestions
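
        Example return shape (illustrative):
            {
                "applied": {"tags": [{"id": 3, "name": "invoice"}], ...},
                "suggestions": {"correspondent": {"id": 12, "name": "ACME",
                                                  "confidence": 0.70}, ...},
            }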
"""
from documents.models import Tag, Correspondent, DocumentType, StoragePath
applied = {
"tags": [],
"correspondent": None,
"document_type": None,
"storage_path": None,
"custom_fields": {},
}
suggestions = {
"tags": [],
"correspondent": None,
"document_type": None,
"storage_path": None,
"custom_fields": {},
}
applied_fields = [] # Track which fields were auto-applied for webhook
try:
with transaction.atomic():
# Apply tags
for tag_id, confidence in scan_result.tags:
if confidence >= self.auto_apply_threshold and auto_apply:
tag = Tag.objects.get(pk=tag_id)
document.add_nested_tags([tag])
applied["tags"].append({"id": tag_id, "name": tag.name})
applied_fields.append("tags")
logger.info(f"Auto-applied tag: {tag.name}")
elif confidence >= self.suggest_threshold:
tag = Tag.objects.get(pk=tag_id)
suggestions["tags"].append({
"id": tag_id,
"name": tag.name,
"confidence": confidence,
})
# Apply correspondent
if scan_result.correspondent:
corr_id, confidence = scan_result.correspondent
if confidence >= self.auto_apply_threshold and auto_apply:
correspondent = Correspondent.objects.get(pk=corr_id)
document.correspondent = correspondent
applied["correspondent"] = {
"id": corr_id,
"name": correspondent.name,
}
applied_fields.append("correspondent")
logger.info(f"Auto-applied correspondent: {correspondent.name}")
elif confidence >= self.suggest_threshold:
correspondent = Correspondent.objects.get(pk=corr_id)
suggestions["correspondent"] = {
"id": corr_id,
"name": correspondent.name,
"confidence": confidence,
}
# Apply document type
if scan_result.document_type:
type_id, confidence = scan_result.document_type
if confidence >= self.auto_apply_threshold and auto_apply:
doc_type = DocumentType.objects.get(pk=type_id)
document.document_type = doc_type
applied["document_type"] = {
"id": type_id,
"name": doc_type.name,
}
applied_fields.append("document_type")
logger.info(f"Auto-applied document type: {doc_type.name}")
elif confidence >= self.suggest_threshold:
doc_type = DocumentType.objects.get(pk=type_id)
suggestions["document_type"] = {
"id": type_id,
"name": doc_type.name,
"confidence": confidence,
}
# Apply storage path
if scan_result.storage_path:
path_id, confidence = scan_result.storage_path
if confidence >= self.auto_apply_threshold and auto_apply:
storage_path = StoragePath.objects.get(pk=path_id)
document.storage_path = storage_path
applied["storage_path"] = {
"id": path_id,
"name": storage_path.name,
}
applied_fields.append("storage_path")
logger.info(f"Auto-applied storage path: {storage_path.name}")
elif confidence >= self.suggest_threshold:
storage_path = StoragePath.objects.get(pk=path_id)
suggestions["storage_path"] = {
"id": path_id,
"name": storage_path.name,
"confidence": confidence,
}
# Save document with changes
document.save()
# Send webhooks for auto-applied suggestions
if applied_fields:
try:
from documents.webhooks import send_suggestion_applied_webhook
send_suggestion_applied_webhook(
document,
scan_result.to_dict(),
applied_fields,
)
except Exception as webhook_error:
logger.warning(
f"Failed to send suggestion applied webhook: {webhook_error}",
exc_info=True,
)
# Send webhook for scan completion
try:
from documents.webhooks import send_scan_completed_webhook
auto_applied_count = len(applied_fields)
suggestions_count = sum([
len(suggestions.get("tags", [])),
1 if suggestions.get("correspondent") else 0,
1 if suggestions.get("document_type") else 0,
1 if suggestions.get("storage_path") else 0,
])
send_scan_completed_webhook(
document,
scan_result.to_dict(),
auto_applied_count,
suggestions_count,
)
except Exception as webhook_error:
logger.warning(
f"Failed to send scan completed webhook: {webhook_error}",
exc_info=True,
)
except Exception as e:
logger.error(f"Failed to apply scan results: {e}", exc_info=True)
return {
"applied": applied,
"suggestions": suggestions,
}
# Global scanner instance (lazy initialized)
_scanner_instance: Optional[AIDocumentScanner] = None
def get_ai_scanner() -> AIDocumentScanner:
"""
Get or create the global AI scanner instance.
Returns:
AIDocumentScanner instance
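
    Example:
        scanner = get_ai_scanner()
        result = scanner.scan_document(document, document.content)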
"""
global _scanner_instance
if _scanner_instance is None:
_scanner_instance = AIDocumentScanner()
return _scanner_instance
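

# Illustrative wiring (assumed integration point; the actual consumer hook
# may differ):
#
#     from documents.ai_scanner import get_ai_scanner
#
#     scanner = get_ai_scanner()
#     result = scanner.scan_document(document, text, original_file_path=path)
#     outcome = scanner.apply_scan_results(document, result, auto_apply=True)
#
# Suggestions at or above auto_apply_threshold are applied and saved inside a
# transaction; the remainder are returned under outcome["suggestions"] for
# user review, in line with the "no destructive action without confirmation"
# requirement.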