Initial exploration: AI Scanner linting and pre-commit hooks

Co-authored-by: dawnsystem <42047891+dawnsystem@users.noreply.github.com>
copilot-swe-agent[bot] 2025-11-12 13:09:03 +00:00
parent 496a9e7b7b
commit 2d7345f0bc
3 changed files with 308 additions and 268 deletions
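
The diff is a lint pass rather than a behavior change: judging by the edits, it is consistent with Ruff's pyupgrade/isort/flake8-commas rules (the exact tool configuration is not shown in this excerpt). `typing.List`/`Dict`/`Optional`/`Tuple` become builtin generics and `X | None` unions, multi-name imports are split one per line, trailing commas are added to wrapped calls, long lines are wrapped, and an f-string without placeholders is downgraded to a plain string. A minimal sketch of the annotation change; the function is illustrative, not from the commit:

# With `from __future__ import annotations` (already present in these files),
# builtin generics and `X | None` parse on all supported Python versions.
from __future__ import annotations

# Before: from typing import Dict, List, Optional, Tuple
#         def top_tag(tags: List[Tuple[int, float]]) -> Optional[Dict[str, float]]: ...

# After: no typing imports needed for these annotations.
def top_tag(tags: list[tuple[int, float]]) -> dict[str, float] | None:
    # Return the highest-confidence tag as {tag_id: confidence}, or None.
    if not tags:
        return None
    tag_id, confidence = max(tags, key=lambda t: t[1])
    return {str(tag_id): confidence}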

View file

@@ -14,15 +14,9 @@ According to agents.md requirements:
 from __future__ import annotations

 import logging
-from datetime import datetime
-from typing import TYPE_CHECKING, Dict, List, Optional, Any
+from typing import Any

-from django.conf import settings
 from django.contrib.auth.models import User
-from django.utils import timezone
-
-if TYPE_CHECKING:
-    from documents.models import Document, DeletionRequest

 logger = logging.getLogger("paperless.ai_deletion")
@@ -30,35 +24,35 @@ logger = logging.getLogger("paperless.ai_deletion")
 class AIDeletionManager:
     """
     Manager for AI-initiated deletion requests.

     Ensures all deletions go through proper user approval workflow.
     """

     @staticmethod
     def create_deletion_request(
-        documents: List,
+        documents: list,
         reason: str,
         user: User,
-        impact_analysis: Optional[Dict[str, Any]] = None,
+        impact_analysis: dict[str, Any] | None = None,
     ):
         """
         Create a new deletion request that requires user approval.

         Args:
             documents: List of documents to be deleted
             reason: Detailed explanation from AI
             user: User who must approve
             impact_analysis: Optional detailed impact analysis

         Returns:
             Created DeletionRequest instance
         """
         from documents.models import DeletionRequest

         # Analyze impact if not provided
         if impact_analysis is None:
             impact_analysis = AIDeletionManager._analyze_impact(documents)

         # Create request
         request = DeletionRequest.objects.create(
             requested_by_ai=True,
@@ -67,25 +61,25 @@ class AIDeletionManager:
             status=DeletionRequest.STATUS_PENDING,
             impact_summary=impact_analysis,
         )

         # Add documents
         request.documents.set(documents)

         logger.info(
             f"Created deletion request {request.id} for {len(documents)} documents "
-            f"requiring approval from user {user.username}"
+            f"requiring approval from user {user.username}",
         )

         # TODO: Send notification to user about pending deletion request
         # This could be via email, in-app notification, or both

         return request

     @staticmethod
-    def _analyze_impact(documents: List) -> Dict[str, Any]:
+    def _analyze_impact(documents: list) -> dict[str, Any]:
         """
         Analyze the impact of deleting the given documents.

         Returns comprehensive information about what will be affected.
         """
         impact = {
@@ -100,7 +94,7 @@ class AIDeletionManager:
                 "latest": None,
             },
         }

         for doc in documents:
             # Document details
             doc_info = {
@@ -112,77 +106,85 @@ class AIDeletionManager:
                 "tags": [tag.name for tag in doc.tags.all()],
             }
             impact["documents"].append(doc_info)

             # Track size (if available)
             # Note: This would need actual file size tracking

             # Track affected metadata
             if doc.correspondent:
                 impact["affected_correspondents"].add(doc.correspondent.name)
             if doc.document_type:
                 impact["affected_types"].add(doc.document_type.name)
             for tag in doc.tags.all():
                 impact["affected_tags"].add(tag.name)

             # Track date range
             if doc.created:
-                if impact["date_range"]["earliest"] is None or doc.created < impact["date_range"]["earliest"]:
+                if (
+                    impact["date_range"]["earliest"] is None
+                    or doc.created < impact["date_range"]["earliest"]
+                ):
                     impact["date_range"]["earliest"] = doc.created
-                if impact["date_range"]["latest"] is None or doc.created > impact["date_range"]["latest"]:
+                if (
+                    impact["date_range"]["latest"] is None
+                    or doc.created > impact["date_range"]["latest"]
+                ):
                     impact["date_range"]["latest"] = doc.created

         # Convert sets to lists for JSON serialization
         impact["affected_tags"] = list(impact["affected_tags"])
         impact["affected_correspondents"] = list(impact["affected_correspondents"])
         impact["affected_types"] = list(impact["affected_types"])

         # Convert dates to ISO format
         if impact["date_range"]["earliest"]:
-            impact["date_range"]["earliest"] = impact["date_range"]["earliest"].isoformat()
+            impact["date_range"]["earliest"] = impact["date_range"][
+                "earliest"
+            ].isoformat()
         if impact["date_range"]["latest"]:
             impact["date_range"]["latest"] = impact["date_range"]["latest"].isoformat()

         return impact

     @staticmethod
-    def get_pending_requests(user: User) -> List:
+    def get_pending_requests(user: User) -> list:
         """
         Get all pending deletion requests for a user.

         Args:
             user: User to get requests for

         Returns:
             List of pending DeletionRequest instances
         """
         from documents.models import DeletionRequest

         return list(
             DeletionRequest.objects.filter(
                 user=user,
                 status=DeletionRequest.STATUS_PENDING,
-            )
+            ),
         )

     @staticmethod
     def format_deletion_request_for_user(request) -> str:
         """
         Format a deletion request into a human-readable message.

         This provides comprehensive information to the user about what
         will be deleted, as required by agents.md.

         Args:
             request: DeletionRequest to format

         Returns:
             Formatted message string
         """
         impact = request.impact_summary

         message = f"""
 ===========================================
 AI DELETION REQUEST #{request.id}
@@ -192,27 +194,27 @@ REASON:
 {request.ai_reason}

 IMPACT SUMMARY:
-- Number of documents: {impact.get('document_count', 0)}
-- Affected tags: {', '.join(impact.get('affected_tags', [])) or 'None'}
-- Affected correspondents: {', '.join(impact.get('affected_correspondents', [])) or 'None'}
-- Affected document types: {', '.join(impact.get('affected_types', [])) or 'None'}
+- Number of documents: {impact.get("document_count", 0)}
+- Affected tags: {", ".join(impact.get("affected_tags", [])) or "None"}
+- Affected correspondents: {", ".join(impact.get("affected_correspondents", [])) or "None"}
+- Affected document types: {", ".join(impact.get("affected_types", [])) or "None"}

 DATE RANGE:
-- Earliest: {impact.get('date_range', {}).get('earliest', 'Unknown')}
-- Latest: {impact.get('date_range', {}).get('latest', 'Unknown')}
+- Earliest: {impact.get("date_range", {}).get("earliest", "Unknown")}
+- Latest: {impact.get("date_range", {}).get("latest", "Unknown")}

 DOCUMENTS TO BE DELETED:
 """

-        for i, doc in enumerate(impact.get('documents', []), 1):
+        for i, doc in enumerate(impact.get("documents", []), 1):
             message += f"""
-{i}. ID: {doc['id']} - {doc['title']}
-Created: {doc['created']}
-Correspondent: {doc['correspondent'] or 'None'}
-Type: {doc['document_type'] or 'None'}
-Tags: {', '.join(doc['tags']) or 'None'}
+{i}. ID: {doc["id"]} - {doc["title"]}
+Created: {doc["created"]}
+Correspondent: {doc["correspondent"] or "None"}
+Type: {doc["document_type"] or "None"}
+Tags: {", ".join(doc["tags"]) or "None"}
 """

         message += """
 ===========================================
@@ -223,21 +225,21 @@ No files will be deleted until you confirm this action.

 Please review the above information carefully before
 approving or rejecting this request.
 """
         return message

     @staticmethod
     def can_ai_delete_automatically() -> bool:
         """
         Check if AI is allowed to delete automatically.

         According to agents.md, AI should NEVER delete without user approval.
         This method always returns False as a safety measure.

         Returns:
             Always False - AI cannot auto-delete
         """
         return False

-__all__ = ['AIDeletionManager']
+__all__ = ["AIDeletionManager"]
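
A usage sketch for the deletion workflow above, assuming a configured Django environment. The module path documents.ai_deletion and the wrapper function are assumptions; the manager API is as defined in this file:

from django.contrib.auth.models import User

from documents.ai_deletion import AIDeletionManager  # assumed module path
from documents.models import Document


def request_cleanup(doc_ids: list[int], username: str) -> str:
    # The AI may only *request* deletion; approval remains with the user,
    # and can_ai_delete_automatically() is hardwired to False.
    user = User.objects.get(username=username)
    docs = list(Document.objects.filter(pk__in=doc_ids))
    request = AIDeletionManager.create_deletion_request(
        documents=docs,
        reason="Exact duplicates of already-archived documents",
        user=user,
    )
    # Render the comprehensive summary the user reviews before deciding.
    return AIDeletionManager.format_deletion_request_for_user(request)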

View file

@@ -20,21 +20,16 @@ According to agents.md requirements:
 from __future__ import annotations

 import logging
-from typing import TYPE_CHECKING, Dict, List, Optional, Any, Tuple
+from typing import TYPE_CHECKING
+from typing import Any

 from django.conf import settings
 from django.db import transaction

 if TYPE_CHECKING:
-    from documents.models import (
-        Document,
-        Tag,
-        Correspondent,
-        DocumentType,
-        StoragePath,
-        CustomField,
-        Workflow,
-    )
+    from documents.models import CustomField
+    from documents.models import Document
+    from documents.models import Workflow

 logger = logging.getLogger("paperless.ai_scanner")
@@ -45,17 +40,25 @@ class AIScanResult:
     """

     def __init__(self):
-        self.tags: List[Tuple[int, float]] = []  # [(tag_id, confidence), ...]
-        self.correspondent: Optional[Tuple[int, float]] = None  # (correspondent_id, confidence)
-        self.document_type: Optional[Tuple[int, float]] = None  # (document_type_id, confidence)
-        self.storage_path: Optional[Tuple[int, float]] = None  # (storage_path_id, confidence)
-        self.custom_fields: Dict[int, Tuple[Any, float]] = {}  # {field_id: (value, confidence), ...}
-        self.workflows: List[Tuple[int, float]] = []  # [(workflow_id, confidence), ...]
-        self.extracted_entities: Dict[str, Any] = {}  # NER results
-        self.title_suggestion: Optional[str] = None
-        self.metadata: Dict[str, Any] = {}  # Additional metadata
+        self.tags: list[tuple[int, float]] = []  # [(tag_id, confidence), ...]
+        self.correspondent: tuple[int, float] | None = (
+            None  # (correspondent_id, confidence)
+        )
+        self.document_type: tuple[int, float] | None = (
+            None  # (document_type_id, confidence)
+        )
+        self.storage_path: tuple[int, float] | None = (
+            None  # (storage_path_id, confidence)
+        )
+        self.custom_fields: dict[
+            int, tuple[Any, float],
+        ] = {}  # {field_id: (value, confidence), ...}
+        self.workflows: list[tuple[int, float]] = []  # [(workflow_id, confidence), ...]
+        self.extracted_entities: dict[str, Any] = {}  # NER results
+        self.title_suggestion: str | None = None
+        self.metadata: dict[str, Any] = {}  # Additional metadata

-    def to_dict(self) -> Dict[str, Any]:
+    def to_dict(self) -> dict[str, Any]:
         """Convert scan results to dictionary for logging/serialization."""
         return {
             "tags": self.tags,
@@ -73,7 +76,7 @@ class AIScanResult:
 class AIDocumentScanner:
     """
     Comprehensive AI scanner for automatic document metadata management.

     This scanner integrates all ML/AI capabilities to provide automatic:
     - Tag assignment based on content analysis
     - Correspondent detection from document text
@@ -81,7 +84,7 @@ class AIDocumentScanner:
     - Storage path suggestion based on content/type
     - Custom field extraction using NER
     - Workflow assignment based on document characteristics

     Features:
     - High confidence threshold (>80%) for automatic application
     - Medium confidence (60-80%) for suggestions requiring user review
@@ -99,7 +102,7 @@
     ):
         """
         Initialize AI scanner.

         Args:
             auto_apply_threshold: Confidence threshold for automatic application (default: 0.80)
             suggest_threshold: Confidence threshold for suggestions (default: 0.60)
@@ -108,7 +111,7 @@
         """
         self.auto_apply_threshold = auto_apply_threshold
         self.suggest_threshold = suggest_threshold

         # Check settings for ML/OCR enablement
         self.ml_enabled = (
             enable_ml_features
@@ -120,16 +123,16 @@
             if enable_advanced_ocr is not None
             else getattr(settings, "PAPERLESS_ENABLE_ADVANCED_OCR", True)
         )

         # Lazy loading of ML components
         self._classifier = None
         self._ner_extractor = None
         self._semantic_search = None
         self._table_extractor = None

         logger.info(
             f"AIDocumentScanner initialized - ML: {self.ml_enabled}, "
-            f"Advanced OCR: {self.advanced_ocr_enabled}"
+            f"Advanced OCR: {self.advanced_ocr_enabled}",
         )

     def _get_classifier(self):
@@ -137,6 +140,7 @@
         if self._classifier is None and self.ml_enabled:
             try:
                 from documents.ml.classifier import TransformerDocumentClassifier
+
                 self._classifier = TransformerDocumentClassifier()
                 logger.info("ML classifier loaded successfully")
             except Exception as e:
@@ -149,6 +153,7 @@
         if self._ner_extractor is None and self.ml_enabled:
             try:
                 from documents.ml.ner import DocumentNER
+
                 self._ner_extractor = DocumentNER()
                 logger.info("NER extractor loaded successfully")
             except Exception as e:
@@ -160,6 +165,7 @@
         if self._semantic_search is None and self.ml_enabled:
             try:
                 from documents.ml.semantic_search import SemanticSearch
+
                 self._semantic_search = SemanticSearch()
                 logger.info("Semantic search loaded successfully")
             except Exception as e:
@@ -171,6 +177,7 @@
         if self._table_extractor is None and self.advanced_ocr_enabled:
             try:
                 from documents.ocr.table_extractor import TableExtractor
+
                 self._table_extractor = TableExtractor()
                 logger.info("Table extractor loaded successfully")
             except Exception as e:
@@ -185,90 +192,108 @@
     ) -> AIScanResult:
         """
         Perform comprehensive AI scan of a document.

         This is the main entry point for document scanning. It orchestrates
         all AI/ML components to analyze the document and generate suggestions.

         Args:
             document: The Document model instance
             document_text: The extracted text content
             original_file_path: Path to original file (for OCR/image analysis)

         Returns:
             AIScanResult containing all suggestions and extracted data
         """
-        logger.info(f"Starting AI scan for document: {document.title} (ID: {document.pk})")
+        logger.info(
+            f"Starting AI scan for document: {document.title} (ID: {document.pk})",
+        )

         result = AIScanResult()

         # Extract entities using NER
         result.extracted_entities = self._extract_entities(document_text)

         # Analyze and suggest tags
-        result.tags = self._suggest_tags(document, document_text, result.extracted_entities)
+        result.tags = self._suggest_tags(
+            document, document_text, result.extracted_entities,
+        )

         # Detect correspondent
         result.correspondent = self._detect_correspondent(
-            document, document_text, result.extracted_entities
+            document,
+            document_text,
+            result.extracted_entities,
         )

         # Classify document type
         result.document_type = self._classify_document_type(
-            document, document_text, result.extracted_entities
+            document,
+            document_text,
+            result.extracted_entities,
         )

         # Suggest storage path
         result.storage_path = self._suggest_storage_path(
-            document, document_text, result
+            document,
+            document_text,
+            result,
         )

         # Extract custom fields
         result.custom_fields = self._extract_custom_fields(
-            document, document_text, result.extracted_entities
+            document,
+            document_text,
+            result.extracted_entities,
         )

         # Suggest workflows
         result.workflows = self._suggest_workflows(document, document_text, result)

         # Generate improved title suggestion
         result.title_suggestion = self._suggest_title(
-            document, document_text, result.extracted_entities
+            document,
+            document_text,
+            result.extracted_entities,
         )

         # Extract tables if advanced OCR enabled
         if self.advanced_ocr_enabled and original_file_path:
             result.metadata["tables"] = self._extract_tables(original_file_path)

         logger.info(f"AI scan completed for document {document.pk}")
         logger.debug(f"Scan results: {result.to_dict()}")

         return result

-    def _extract_entities(self, text: str) -> Dict[str, Any]:
+    def _extract_entities(self, text: str) -> dict[str, Any]:
         """
         Extract named entities from document text using NER.

         Returns:
             Dictionary with extracted entities (persons, orgs, dates, amounts, etc.)
         """
         ner = self._get_ner_extractor()
         if not ner:
             return {}

         try:
             # Use extract_all to get comprehensive entity extraction
             entities = ner.extract_all(text)

             # Convert string lists to dict format for consistency
             for key in ["persons", "organizations", "locations", "misc"]:
                 if key in entities and isinstance(entities[key], list):
-                    entities[key] = [{"text": e} if isinstance(e, str) else e for e in entities[key]]
+                    entities[key] = [
+                        {"text": e} if isinstance(e, str) else e for e in entities[key]
+                    ]

             for key in ["dates", "amounts"]:
                 if key in entities and isinstance(entities[key], list):
-                    entities[key] = [{"text": e} if isinstance(e, str) else e for e in entities[key]]
+                    entities[key] = [
+                        {"text": e} if isinstance(e, str) else e for e in entities[key]
+                    ]

-            logger.debug(f"Extracted entities from NER")
+            logger.debug("Extracted entities from NER")
             return entities
         except Exception as e:
             logger.error(f"Entity extraction failed: {e}", exc_info=True)
@@ -278,156 +303,157 @@
         self,
         document: Document,
         text: str,
-        entities: Dict[str, Any],
-    ) -> List[Tuple[int, float]]:
+        entities: dict[str, Any],
+    ) -> list[tuple[int, float]]:
         """
         Suggest relevant tags based on document content and entities.

         Uses a combination of:
         - Keyword matching with existing tag patterns
         - ML classification if available
         - Entity-based suggestions (e.g., organization -> company tag)

         Returns:
             List of (tag_id, confidence) tuples
         """
-        from documents.models import Tag
         from documents.matching import match_tags
+        from documents.models import Tag

         suggestions = []

         try:
             # Use existing matching logic
             matched_tags = match_tags(document, self._get_classifier())

             # Add confidence scores based on matching strength
             for tag in matched_tags:
                 confidence = 0.85  # High confidence for matched tags
                 suggestions.append((tag.id, confidence))

             # Additional entity-based suggestions
             if entities:
                 # Suggest tags based on detected entities
                 all_tags = Tag.objects.all()

                 # Check for organization entities -> company/business tags
                 if entities.get("organizations"):
                     for tag in all_tags.filter(name__icontains="company"):
                         suggestions.append((tag.id, 0.70))

                 # Check for date entities -> tax/financial tags if year-end
                 if entities.get("dates"):
                     for tag in all_tags.filter(name__icontains="tax"):
                         suggestions.append((tag.id, 0.65))

             # Remove duplicates, keep highest confidence
             seen = {}
             for tag_id, conf in suggestions:
                 if tag_id not in seen or conf > seen[tag_id]:
                     seen[tag_id] = conf

             suggestions = [(tid, conf) for tid, conf in seen.items()]
             suggestions.sort(key=lambda x: x[1], reverse=True)

             logger.debug(f"Suggested {len(suggestions)} tags")

         except Exception as e:
             logger.error(f"Tag suggestion failed: {e}", exc_info=True)

         return suggestions

     def _detect_correspondent(
         self,
         document: Document,
         text: str,
-        entities: Dict[str, Any],
-    ) -> Optional[Tuple[int, float]]:
+        entities: dict[str, Any],
+    ) -> tuple[int, float] | None:
         """
         Detect correspondent based on document content and entities.

         Uses:
         - Organization entities from NER
         - Email domains
         - Existing correspondent matching patterns

         Returns:
             (correspondent_id, confidence) or None
         """
-        from documents.models import Correspondent
         from documents.matching import match_correspondents
+        from documents.models import Correspondent

         try:
             # Use existing matching logic
-            matched_correspondents = match_correspondents(document, self._get_classifier())
+            matched_correspondents = match_correspondents(
+                document, self._get_classifier(),
+            )
             if matched_correspondents:
                 correspondent = matched_correspondents[0]
                 confidence = 0.85
                 logger.debug(
                     f"Detected correspondent: {correspondent.name} "
-                    f"(confidence: {confidence})"
+                    f"(confidence: {confidence})",
                 )
                 return (correspondent.id, confidence)

             # Try to match based on NER organizations
             if entities.get("organizations"):
                 org_name = entities["organizations"][0]["text"]

                 # Try to find existing correspondent with similar name
                 correspondents = Correspondent.objects.filter(
-                    name__icontains=org_name[:20]  # First 20 chars
+                    name__icontains=org_name[:20],  # First 20 chars
                 )
                 if correspondents.exists():
                     correspondent = correspondents.first()
                     confidence = 0.70
                     logger.debug(
                         f"Detected correspondent from NER: {correspondent.name} "
-                        f"(confidence: {confidence})"
+                        f"(confidence: {confidence})",
                     )
                     return (correspondent.id, confidence)

         except Exception as e:
             logger.error(f"Correspondent detection failed: {e}", exc_info=True)

         return None

     def _classify_document_type(
         self,
         document: Document,
         text: str,
-        entities: Dict[str, Any],
-    ) -> Optional[Tuple[int, float]]:
+        entities: dict[str, Any],
+    ) -> tuple[int, float] | None:
         """
         Classify document type using ML and content analysis.

         Returns:
             (document_type_id, confidence) or None
         """
-        from documents.models import DocumentType
         from documents.matching import match_document_types

         try:
             # Use existing matching logic
             matched_types = match_document_types(document, self._get_classifier())
             if matched_types:
                 doc_type = matched_types[0]
                 confidence = 0.85
                 logger.debug(
                     f"Classified document type: {doc_type.name} "
-                    f"(confidence: {confidence})"
+                    f"(confidence: {confidence})",
                 )
                 return (doc_type.id, confidence)

             # ML-based classification if available
             classifier = self._get_classifier()
             if classifier and hasattr(classifier, "predict"):
                 # This would need a trained model with document type labels
                 # For now, fall back to pattern matching
                 pass

         except Exception as e:
             logger.error(f"Document type classification failed: {e}", exc_info=True)

         return None

     def _suggest_storage_path(
@@ -435,127 +461,131 @@
         document: Document,
         text: str,
         scan_result: AIScanResult,
-    ) -> Optional[Tuple[int, float]]:
+    ) -> tuple[int, float] | None:
         """
         Suggest appropriate storage path based on document characteristics.

         Returns:
             (storage_path_id, confidence) or None
         """
-        from documents.models import StoragePath
         from documents.matching import match_storage_paths

         try:
             # Use existing matching logic
             matched_paths = match_storage_paths(document, self._get_classifier())
             if matched_paths:
                 storage_path = matched_paths[0]
                 confidence = 0.80
                 logger.debug(
                     f"Suggested storage path: {storage_path.name} "
-                    f"(confidence: {confidence})"
+                    f"(confidence: {confidence})",
                 )
                 return (storage_path.id, confidence)

         except Exception as e:
             logger.error(f"Storage path suggestion failed: {e}", exc_info=True)

         return None

     def _extract_custom_fields(
         self,
         document: Document,
         text: str,
-        entities: Dict[str, Any],
-    ) -> Dict[int, Tuple[Any, float]]:
+        entities: dict[str, Any],
+    ) -> dict[int, tuple[Any, float]]:
         """
         Extract values for custom fields using NER and pattern matching.

         Returns:
             Dictionary mapping field_id to (value, confidence)
         """
         from documents.models import CustomField

         extracted_fields = {}

         try:
             custom_fields = CustomField.objects.all()

             for field in custom_fields:
                 # Try to extract field value based on field name and type
                 value, confidence = self._extract_field_value(
-                    field, text, entities
+                    field,
+                    text,
+                    entities,
                 )

                 if value is not None and confidence >= self.suggest_threshold:
                     extracted_fields[field.id] = (value, confidence)
                     logger.debug(
                         f"Extracted custom field '{field.name}': {value} "
-                        f"(confidence: {confidence})"
+                        f"(confidence: {confidence})",
                     )

         except Exception as e:
             logger.error(f"Custom field extraction failed: {e}", exc_info=True)

         return extracted_fields

     def _extract_field_value(
         self,
         field: CustomField,
         text: str,
-        entities: Dict[str, Any],
-    ) -> Tuple[Any, float]:
+        entities: dict[str, Any],
+    ) -> tuple[Any, float]:
         """
         Extract a single custom field value.

         Returns:
             (value, confidence) tuple
         """
         field_name_lower = field.name.lower()

         # Date fields
         if "date" in field_name_lower:
             dates = entities.get("dates", [])
             if dates:
                 return (dates[0]["text"], 0.75)

         # Amount/price fields
-        if any(keyword in field_name_lower for keyword in ["amount", "price", "cost", "total"]):
+        if any(
+            keyword in field_name_lower
+            for keyword in ["amount", "price", "cost", "total"]
+        ):
             amounts = entities.get("amounts", [])
             if amounts:
                 return (amounts[0]["text"], 0.75)

         # Invoice number fields
         if "invoice" in field_name_lower:
             invoice_numbers = entities.get("invoice_numbers", [])
             if invoice_numbers:
                 return (invoice_numbers[0], 0.80)

         # Email fields
         if "email" in field_name_lower:
             emails = entities.get("emails", [])
             if emails:
                 return (emails[0], 0.85)

         # Phone fields
         if "phone" in field_name_lower:
             phones = entities.get("phones", [])
             if phones:
                 return (phones[0], 0.85)

         # Person name fields
         if "name" in field_name_lower or "person" in field_name_lower:
             persons = entities.get("persons", [])
             if persons:
                 return (persons[0]["text"], 0.70)

         # Organization fields
         if "company" in field_name_lower or "organization" in field_name_lower:
             orgs = entities.get("organizations", [])
             if orgs:
                 return (orgs[0]["text"], 0.70)

         return (None, 0.0)

     def _suggest_workflows(
@@ -563,40 +593,43 @@
         document: Document,
         text: str,
         scan_result: AIScanResult,
-    ) -> List[Tuple[int, float]]:
+    ) -> list[tuple[int, float]]:
         """
         Suggest relevant workflows based on document characteristics.

         Returns:
             List of (workflow_id, confidence) tuples
         """
-        from documents.models import Workflow, WorkflowTrigger
+        from documents.models import Workflow
+        from documents.models import WorkflowTrigger

         suggestions = []

         try:
             # Get all workflows with consumption triggers
             workflows = Workflow.objects.filter(
                 enabled=True,
                 triggers__type=WorkflowTrigger.WorkflowTriggerType.CONSUMPTION,
             ).distinct()

             for workflow in workflows:
                 # Evaluate workflow conditions against scan results
                 confidence = self._evaluate_workflow_match(
-                    workflow, document, scan_result
+                    workflow,
+                    document,
+                    scan_result,
                 )

                 if confidence >= self.suggest_threshold:
                     suggestions.append((workflow.id, confidence))
                     logger.debug(
                         f"Suggested workflow: {workflow.name} "
-                        f"(confidence: {confidence})"
+                        f"(confidence: {confidence})",
                     )

         except Exception as e:
             logger.error(f"Workflow suggestion failed: {e}", exc_info=True)

         return suggestions

     def _evaluate_workflow_match(
@@ -607,80 +640,80 @@
     ) -> float:
         """
         Evaluate how well a workflow matches the document.

         Returns:
             Confidence score (0.0 to 1.0)
         """
         # This is a simplified evaluation
         # In practice, you'd check workflow triggers and conditions
         confidence = 0.5  # Base confidence

         # Increase confidence if document type matches workflow expectations
         if scan_result.document_type and workflow.actions.exists():
             confidence += 0.2

         # Increase confidence if correspondent matches
         if scan_result.correspondent:
             confidence += 0.15

         # Increase confidence if tags match
         if scan_result.tags:
             confidence += 0.15

         return min(confidence, 1.0)

     def _suggest_title(
         self,
         document: Document,
         text: str,
-        entities: Dict[str, Any],
-    ) -> Optional[str]:
+        entities: dict[str, Any],
+    ) -> str | None:
         """
         Generate an improved title suggestion based on document content.

         Returns:
             Suggested title or None
         """
         try:
             # Extract key information for title
             title_parts = []

             # Add document type if detected
             if entities.get("document_type"):
                 title_parts.append(entities["document_type"])

             # Add primary organization
             orgs = entities.get("organizations", [])
             if orgs:
                 title_parts.append(orgs[0]["text"][:30])  # Limit length

             # Add date if available
             dates = entities.get("dates", [])
             if dates:
                 title_parts.append(dates[0]["text"])

             if title_parts:
                 suggested_title = " - ".join(title_parts)
                 logger.debug(f"Generated title suggestion: {suggested_title}")
                 return suggested_title[:127]  # Respect title length limit

         except Exception as e:
             logger.error(f"Title suggestion failed: {e}", exc_info=True)

         return None

-    def _extract_tables(self, file_path: str) -> List[Dict[str, Any]]:
+    def _extract_tables(self, file_path: str) -> list[dict[str, Any]]:
         """
         Extract tables from document using advanced OCR.

         Returns:
             List of extracted tables with data and metadata
         """
         extractor = self._get_table_extractor()
         if not extractor:
             return []

         try:
             tables = extractor.extract_tables_from_image(file_path)
             logger.debug(f"Extracted {len(tables)} tables from document")
@@ -695,21 +728,24 @@
         scan_result: AIScanResult,
         auto_apply: bool = True,
         user_confirmed: bool = False,
-    ) -> Dict[str, Any]:
+    ) -> dict[str, Any]:
         """
         Apply AI scan results to document.

         Args:
             document: Document to update
             scan_result: AI scan results
             auto_apply: Whether to auto-apply high confidence suggestions
             user_confirmed: Whether user has confirmed low-confidence changes

         Returns:
             Dictionary with applied changes and pending suggestions
         """
-        from documents.models import Tag, Correspondent, DocumentType, StoragePath
+        from documents.models import Correspondent
+        from documents.models import DocumentType
+        from documents.models import StoragePath
+        from documents.models import Tag

         applied = {
             "tags": [],
             "correspondent": None,
@@ -717,7 +753,7 @@
             "storage_path": None,
             "custom_fields": {},
         }

         suggestions = {
             "tags": [],
             "correspondent": None,
@@ -725,7 +761,7 @@
             "storage_path": None,
             "custom_fields": {},
         }

         try:
             with transaction.atomic():
                 # Apply tags
@@ -737,12 +773,14 @@
                         logger.info(f"Auto-applied tag: {tag.name}")
                     elif confidence >= self.suggest_threshold:
                         tag = Tag.objects.get(pk=tag_id)
-                        suggestions["tags"].append({
-                            "id": tag_id,
-                            "name": tag.name,
-                            "confidence": confidence,
-                        })
+                        suggestions["tags"].append(
+                            {
+                                "id": tag_id,
+                                "name": tag.name,
+                                "confidence": confidence,
+                            },
+                        )

                 # Apply correspondent
                 if scan_result.correspondent:
                     corr_id, confidence = scan_result.correspondent
@@ -761,7 +799,7 @@
                         "name": correspondent.name,
                         "confidence": confidence,
                     }

                 # Apply document type
                 if scan_result.document_type:
                     type_id, confidence = scan_result.document_type
@@ -780,7 +818,7 @@
                         "name": doc_type.name,
                         "confidence": confidence,
                     }

                 # Apply storage path
                 if scan_result.storage_path:
                     path_id, confidence = scan_result.storage_path
@@ -799,13 +837,13 @@
                         "name": storage_path.name,
                         "confidence": confidence,
                     }

                 # Save document with changes
                 document.save()

         except Exception as e:
             logger.error(f"Failed to apply scan results: {e}", exc_info=True)

         return {
             "applied": applied,
             "suggestions": suggestions,
@@ -819,7 +857,7 @@ _scanner_instance = None

 def get_ai_scanner() -> AIDocumentScanner:
     """
     Get or create the global AI scanner instance.

     Returns:
         AIDocumentScanner instance
     """

View file

@@ -756,22 +756,22 @@ class ConsumerPlugin(
     def _run_ai_scanner(self, document, text):
         """
         Run AI scanner on the document to automatically detect and apply metadata.

         This is called during document consumption to leverage AI/ML capabilities
         for automatic metadata management as specified in agents.md.

         Args:
             document: The Document model instance
             text: The extracted document text
         """
         try:
             from documents.ai_scanner import get_ai_scanner

             scanner = get_ai_scanner()

             # Get the original file path if available
             original_file_path = str(self.working_copy) if self.working_copy else None

             # Perform comprehensive AI scan
             self.log.info(f"Running AI scanner on document: {document.title}")
             scan_result = scanner.scan_document(
@@ -779,65 +779,65 @@
                 document_text=text,
                 original_file_path=original_file_path,
             )

             # Apply scan results (auto-apply high confidence, suggest medium confidence)
             results = scanner.apply_scan_results(
                 document=document,
                 scan_result=scan_result,
                 auto_apply=True,  # Auto-apply high confidence suggestions
             )

             # Log what was applied and suggested
             if results["applied"]["tags"]:
                 self.log.info(
-                    f"AI auto-applied tags: {[t['name'] for t in results['applied']['tags']]}"
+                    f"AI auto-applied tags: {[t['name'] for t in results['applied']['tags']]}",
                 )
             if results["applied"]["correspondent"]:
                 self.log.info(
-                    f"AI auto-applied correspondent: {results['applied']['correspondent']['name']}"
+                    f"AI auto-applied correspondent: {results['applied']['correspondent']['name']}",
                 )
             if results["applied"]["document_type"]:
                 self.log.info(
-                    f"AI auto-applied document type: {results['applied']['document_type']['name']}"
+                    f"AI auto-applied document type: {results['applied']['document_type']['name']}",
                 )
             if results["applied"]["storage_path"]:
                 self.log.info(
-                    f"AI auto-applied storage path: {results['applied']['storage_path']['name']}"
+                    f"AI auto-applied storage path: {results['applied']['storage_path']['name']}",
                 )

             # Log suggestions for user review
             if results["suggestions"]["tags"]:
                 self.log.info(
                     f"AI suggested tags (require review): "
-                    f"{[t['name'] for t in results['suggestions']['tags']]}"
+                    f"{[t['name'] for t in results['suggestions']['tags']]}",
                 )
             if results["suggestions"]["correspondent"]:
                 self.log.info(
                     f"AI suggested correspondent (requires review): "
-                    f"{results['suggestions']['correspondent']['name']}"
+                    f"{results['suggestions']['correspondent']['name']}",
                 )
             if results["suggestions"]["document_type"]:
                 self.log.info(
                     f"AI suggested document type (requires review): "
-                    f"{results['suggestions']['document_type']['name']}"
+                    f"{results['suggestions']['document_type']['name']}",
                 )
             if results["suggestions"]["storage_path"]:
                 self.log.info(
                     f"AI suggested storage path (requires review): "
-                    f"{results['suggestions']['storage_path']['name']}"
+                    f"{results['suggestions']['storage_path']['name']}",
                 )

             # Store suggestions in document metadata for UI to display
             # This allows the frontend to show AI suggestions to users
-            if not hasattr(document, '_ai_suggestions'):
+            if not hasattr(document, "_ai_suggestions"):
                 document._ai_suggestions = results["suggestions"]

         except ImportError:
             # AI scanner not available, skip
             self.log.debug("AI scanner not available, skipping AI analysis")