Mirror of https://github.com/paperless-ngx/paperless-ngx.git, synced 2025-12-12 09:37:04 +01:00
Initial exploration: AI Scanner linting and pre-commit hooks
Co-authored-by: dawnsystem <42047891+dawnsystem@users.noreply.github.com>
This commit is contained in: parent 496a9e7b7b, commit 2d7345f0bc
3 changed files with 308 additions and 268 deletions
File 1 of 3: AI deletion manager (file paths not preserved in this view)

@@ -14,15 +14,9 @@ According to agents.md requirements:
 from __future__ import annotations
 
 import logging
-from datetime import datetime
-from typing import TYPE_CHECKING, Dict, List, Optional, Any
+from typing import Any
 
-from django.conf import settings
 from django.contrib.auth.models import User
-from django.utils import timezone
-
-if TYPE_CHECKING:
-    from documents.models import Document, DeletionRequest
 
 logger = logging.getLogger("paperless.ai_deletion")
 
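The import rewrite above is consistent with ruff's pyupgrade rules (UP006/UP007/UP035, inferred from the shape of the change): with "from __future__ import annotations" in place, typing.List/Dict/Optional/Tuple give way to builtin generics and PEP 604 unions. A minimal before/after sketch with illustrative names, not the project's code:

from __future__ import annotations

from typing import Any


# Before (deprecated typing aliases):
#     def lookup(ids: List[int]) -> Optional[Dict[str, Any]]: ...

# After (PEP 585 builtin generics, PEP 604 union):
def lookup(ids: list[int]) -> dict[str, Any] | None:
    # Return a payload for the first id, or None when the list is empty.
    if not ids:
        return None
    return {"id": ids[0]}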
@@ -36,10 +30,10 @@ class AIDeletionManager:
 
     @staticmethod
     def create_deletion_request(
-        documents: List,
+        documents: list,
         reason: str,
         user: User,
-        impact_analysis: Optional[Dict[str, Any]] = None,
+        impact_analysis: dict[str, Any] | None = None,
     ):
         """
         Create a new deletion request that requires user approval.
@@ -73,7 +67,7 @@ class AIDeletionManager:
 
         logger.info(
             f"Created deletion request {request.id} for {len(documents)} documents "
-            f"requiring approval from user {user.username}"
+            f"requiring approval from user {user.username}",
        )
 
        # TODO: Send notification to user about pending deletion request
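The trailing comma added to the last logger.info() argument above is the fix enforced by flake8-commas (ruff rule COM812, an inference from the pattern): every multi-line call ends its final element with a comma, so appending another argument later is a one-line diff. Runnable sketch:

import logging

logger = logging.getLogger(__name__)

logger.info(
    "created request %s for %s documents",
    17,
    3,  # trailing comma: the next argument added here touches one line only
)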
@@ -82,7 +76,7 @@ class AIDeletionManager:
         return request
 
     @staticmethod
-    def _analyze_impact(documents: List) -> Dict[str, Any]:
+    def _analyze_impact(documents: list) -> dict[str, Any]:
         """
         Analyze the impact of deleting the given documents.
 
@@ -128,10 +122,16 @@ class AIDeletionManager:
 
             # Track date range
             if doc.created:
-                if impact["date_range"]["earliest"] is None or doc.created < impact["date_range"]["earliest"]:
+                if (
+                    impact["date_range"]["earliest"] is None
+                    or doc.created < impact["date_range"]["earliest"]
+                ):
                     impact["date_range"]["earliest"] = doc.created
 
-                if impact["date_range"]["latest"] is None or doc.created > impact["date_range"]["latest"]:
+                if (
+                    impact["date_range"]["latest"] is None
+                    or doc.created > impact["date_range"]["latest"]
+                ):
                     impact["date_range"]["latest"] = doc.created
 
         # Convert sets to lists for JSON serialization
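The wrap above is the usual cure for an over-long boolean test (line-length limits such as E501): parenthesize the condition and give each clause its own line. Standalone sketch:

earliest = None
created = 5

# Before: one over-long line
#     if earliest is None or created < earliest: ...

# After: parenthesized, one clause per line
if (
    earliest is None
    or created < earliest
):
    earliest = created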
@@ -141,14 +141,16 @@ class AIDeletionManager:
 
         # Convert dates to ISO format
         if impact["date_range"]["earliest"]:
-            impact["date_range"]["earliest"] = impact["date_range"]["earliest"].isoformat()
+            impact["date_range"]["earliest"] = impact["date_range"][
+                "earliest"
+            ].isoformat()
         if impact["date_range"]["latest"]:
             impact["date_range"]["latest"] = impact["date_range"]["latest"].isoformat()
 
         return impact
 
     @staticmethod
-    def get_pending_requests(user: User) -> List:
+    def get_pending_requests(user: User) -> list:
         """
         Get all pending deletion requests for a user.
 
@@ -164,7 +166,7 @@ class AIDeletionManager:
             DeletionRequest.objects.filter(
                 user=user,
                 status=DeletionRequest.STATUS_PENDING,
-            )
+            ),
         )
 
     @staticmethod
@@ -192,25 +194,25 @@ REASON:
 {request.ai_reason}
 
 IMPACT SUMMARY:
-- Number of documents: {impact.get('document_count', 0)}
-- Affected tags: {', '.join(impact.get('affected_tags', [])) or 'None'}
-- Affected correspondents: {', '.join(impact.get('affected_correspondents', [])) or 'None'}
-- Affected document types: {', '.join(impact.get('affected_types', [])) or 'None'}
+- Number of documents: {impact.get("document_count", 0)}
+- Affected tags: {", ".join(impact.get("affected_tags", [])) or "None"}
+- Affected correspondents: {", ".join(impact.get("affected_correspondents", [])) or "None"}
+- Affected document types: {", ".join(impact.get("affected_types", [])) or "None"}
 
 DATE RANGE:
-- Earliest: {impact.get('date_range', {}).get('earliest', 'Unknown')}
-- Latest: {impact.get('date_range', {}).get('latest', 'Unknown')}
+- Earliest: {impact.get("date_range", {}).get("earliest", "Unknown")}
+- Latest: {impact.get("date_range", {}).get("latest", "Unknown")}
 
 DOCUMENTS TO BE DELETED:
 """
 
-        for i, doc in enumerate(impact.get('documents', []), 1):
+        for i, doc in enumerate(impact.get("documents", []), 1):
             message += f"""
-{i}. ID: {doc['id']} - {doc['title']}
-   Created: {doc['created']}
-   Correspondent: {doc['correspondent'] or 'None'}
-   Type: {doc['document_type'] or 'None'}
-   Tags: {', '.join(doc['tags']) or 'None'}
+{i}. ID: {doc["id"]} - {doc["title"]}
+   Created: {doc["created"]}
+   Correspondent: {doc["correspondent"] or "None"}
+   Type: {doc["document_type"] or "None"}
+   Tags: {", ".join(doc["tags"]) or "None"}
 """
 
         message += """
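The quote flips above put double quotes inside double-quoted f-strings. Worth noting: reusing the same quote character inside an f-string is only valid on Python 3.12+ (PEP 701); on older interpreters the inner quotes must differ, so this hunk implies a 3.12 floor (an inference, not stated in the commit). Sketch:

doc = {"id": 7, "title": "Invoice"}

line_legacy = f"ID: {doc['id']} - {doc['title']}"    # any Python 3
# line_modern = f"ID: {doc["id"]} - {doc["title"]}"  # Python 3.12+ only (PEP 701)
print(line_legacy)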
@@ -240,4 +242,4 @@ approving or rejecting this request.
         return False
 
 
-__all__ = ['AIDeletionManager']
+__all__ = ["AIDeletionManager"]
File 2 of 3: AI document scanner

@@ -20,21 +20,16 @@ According to agents.md requirements:
 from __future__ import annotations
 
 import logging
-from typing import TYPE_CHECKING, Dict, List, Optional, Any, Tuple
+from typing import TYPE_CHECKING
+from typing import Any
 
 from django.conf import settings
 from django.db import transaction
 
 if TYPE_CHECKING:
-    from documents.models import (
-        Document,
-        Tag,
-        Correspondent,
-        DocumentType,
-        StoragePath,
-        CustomField,
-        Workflow,
-    )
+    from documents.models import CustomField
+    from documents.models import Document
+    from documents.models import Workflow
 
 logger = logging.getLogger("paperless.ai_scanner")
 
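Splitting the grouped documents.models import into one "from X import Y" per name matches isort's force-single-line option (in ruff: force-single-line = true under the isort settings; an inference from the diff). Each name then owns exactly one line, which keeps merges and greps trivial. Stdlib illustration:

# Before: one grouped import
#     from collections import (
#         OrderedDict,
#         defaultdict,
#     )

# After: one name per line
from collections import OrderedDict
from collections import defaultdict

counts = defaultdict(int)
counts["invoice"] += 1
print(OrderedDict(sorted(counts.items())))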
@@ -45,17 +40,25 @@ class AIScanResult:
     """
 
     def __init__(self):
-        self.tags: List[Tuple[int, float]] = []  # [(tag_id, confidence), ...]
-        self.correspondent: Optional[Tuple[int, float]] = None  # (correspondent_id, confidence)
-        self.document_type: Optional[Tuple[int, float]] = None  # (document_type_id, confidence)
-        self.storage_path: Optional[Tuple[int, float]] = None  # (storage_path_id, confidence)
-        self.custom_fields: Dict[int, Tuple[Any, float]] = {}  # {field_id: (value, confidence), ...}
-        self.workflows: List[Tuple[int, float]] = []  # [(workflow_id, confidence), ...]
-        self.extracted_entities: Dict[str, Any] = {}  # NER results
-        self.title_suggestion: Optional[str] = None
-        self.metadata: Dict[str, Any] = {}  # Additional metadata
+        self.tags: list[tuple[int, float]] = []  # [(tag_id, confidence), ...]
+        self.correspondent: tuple[int, float] | None = (
+            None  # (correspondent_id, confidence)
+        )
+        self.document_type: tuple[int, float] | None = (
+            None  # (document_type_id, confidence)
+        )
+        self.storage_path: tuple[int, float] | None = (
+            None  # (storage_path_id, confidence)
+        )
+        self.custom_fields: dict[
+            int, tuple[Any, float],
+        ] = {}  # {field_id: (value, confidence), ...}
+        self.workflows: list[tuple[int, float]] = []  # [(workflow_id, confidence), ...]
+        self.extracted_entities: dict[str, Any] = {}  # NER results
+        self.title_suggestion: str | None = None
+        self.metadata: dict[str, Any] = {}  # Additional metadata
 
-    def to_dict(self) -> Dict[str, Any]:
+    def to_dict(self) -> dict[str, Any]:
         """Convert scan results to dictionary for logging/serialization."""
         return {
             "tags": self.tags,
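The AIScanResult rewrite above keeps the hand-written __init__ and only modernizes the annotations. A hedged alternative sketch (not the project's code): the same container as a dataclass, which generates __init__ and safe per-instance defaults:

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any


@dataclass
class ScanResultSketch:
    tags: list[tuple[int, float]] = field(default_factory=list)
    correspondent: tuple[int, float] | None = None
    document_type: tuple[int, float] | None = None
    storage_path: tuple[int, float] | None = None
    custom_fields: dict[int, tuple[Any, float]] = field(default_factory=dict)
    workflows: list[tuple[int, float]] = field(default_factory=list)
    extracted_entities: dict[str, Any] = field(default_factory=dict)
    title_suggestion: str | None = None
    metadata: dict[str, Any] = field(default_factory=dict)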
@@ -129,7 +132,7 @@ class AIDocumentScanner:
 
         logger.info(
             f"AIDocumentScanner initialized - ML: {self.ml_enabled}, "
-            f"Advanced OCR: {self.advanced_ocr_enabled}"
+            f"Advanced OCR: {self.advanced_ocr_enabled}",
         )
 
     def _get_classifier(self):

@@ -137,6 +140,7 @@ class AIDocumentScanner:
         if self._classifier is None and self.ml_enabled:
             try:
                 from documents.ml.classifier import TransformerDocumentClassifier
+
                 self._classifier = TransformerDocumentClassifier()
                 logger.info("ML classifier loaded successfully")
             except Exception as e:

@@ -149,6 +153,7 @@ class AIDocumentScanner:
         if self._ner_extractor is None and self.ml_enabled:
             try:
                 from documents.ml.ner import DocumentNER
+
                 self._ner_extractor = DocumentNER()
                 logger.info("NER extractor loaded successfully")
             except Exception as e:

@@ -160,6 +165,7 @@ class AIDocumentScanner:
         if self._semantic_search is None and self.ml_enabled:
             try:
                 from documents.ml.semantic_search import SemanticSearch
+
                 self._semantic_search = SemanticSearch()
                 logger.info("Semantic search loaded successfully")
             except Exception as e:

@@ -171,6 +177,7 @@ class AIDocumentScanner:
         if self._table_extractor is None and self.advanced_ocr_enabled:
             try:
                 from documents.ocr.table_extractor import TableExtractor
+
                 self._table_extractor = TableExtractor()
                 logger.info("Table extractor loaded successfully")
             except Exception as e:
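The four accessors above share one lazy-loading shape: import the heavy dependency on first use, cache the built object on the instance, and degrade gracefully when the import or construction fails. A self-contained sketch of the pattern (stand-in types, not the project's ML classes):

import logging

logger = logging.getLogger(__name__)


class LazyHolder:
    def __init__(self) -> None:
        self._engine = None
        self.enabled = True

    def _get_engine(self):
        # Build the expensive object on the first call only.
        if self._engine is None and self.enabled:
            try:
                from json import JSONDecoder  # stand-in for a heavy import

                self._engine = JSONDecoder()
                logger.info("engine loaded")
            except Exception as e:
                logger.warning(f"engine unavailable: {e}")
                self.enabled = False  # optional refinement: skip retries
        return self._engine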
@@ -197,7 +204,9 @@ class AIDocumentScanner:
         Returns:
             AIScanResult containing all suggestions and extracted data
         """
-        logger.info(f"Starting AI scan for document: {document.title} (ID: {document.pk})")
+        logger.info(
+            f"Starting AI scan for document: {document.title} (ID: {document.pk})",
+        )
 
         result = AIScanResult()
 

@@ -205,26 +214,36 @@ class AIDocumentScanner:
         result.extracted_entities = self._extract_entities(document_text)
 
         # Analyze and suggest tags
-        result.tags = self._suggest_tags(document, document_text, result.extracted_entities)
+        result.tags = self._suggest_tags(
+            document, document_text, result.extracted_entities,
+        )
 
         # Detect correspondent
         result.correspondent = self._detect_correspondent(
-            document, document_text, result.extracted_entities
+            document,
+            document_text,
+            result.extracted_entities,
         )
 
         # Classify document type
         result.document_type = self._classify_document_type(
-            document, document_text, result.extracted_entities
+            document,
+            document_text,
+            result.extracted_entities,
         )
 
         # Suggest storage path
         result.storage_path = self._suggest_storage_path(
-            document, document_text, result
+            document,
+            document_text,
+            result,
         )
 
         # Extract custom fields
         result.custom_fields = self._extract_custom_fields(
-            document, document_text, result.extracted_entities
+            document,
+            document_text,
+            result.extracted_entities,
         )
 
         # Suggest workflows

@@ -232,7 +251,9 @@ class AIDocumentScanner:
 
         # Generate improved title suggestion
         result.title_suggestion = self._suggest_title(
-            document, document_text, result.extracted_entities
+            document,
+            document_text,
+            result.extracted_entities,
         )
 
         # Extract tables if advanced OCR enabled
@@ -244,7 +265,7 @@ class AIDocumentScanner:
 
         return result
 
-    def _extract_entities(self, text: str) -> Dict[str, Any]:
+    def _extract_entities(self, text: str) -> dict[str, Any]:
         """
         Extract named entities from document text using NER.
 

@@ -262,13 +283,17 @@ class AIDocumentScanner:
             # Convert string lists to dict format for consistency
             for key in ["persons", "organizations", "locations", "misc"]:
                 if key in entities and isinstance(entities[key], list):
-                    entities[key] = [{"text": e} if isinstance(e, str) else e for e in entities[key]]
+                    entities[key] = [
+                        {"text": e} if isinstance(e, str) else e for e in entities[key]
+                    ]
 
             for key in ["dates", "amounts"]:
                 if key in entities and isinstance(entities[key], list):
-                    entities[key] = [{"text": e} if isinstance(e, str) else e for e in entities[key]]
+                    entities[key] = [
+                        {"text": e} if isinstance(e, str) else e for e in entities[key]
+                    ]
 
-            logger.debug(f"Extracted entities from NER")
+            logger.debug("Extracted entities from NER")
             return entities
         except Exception as e:
             logger.error(f"Entity extraction failed: {e}", exc_info=True)
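One genuine fix hides among the formatting above: logger.debug(f"Extracted entities from NER") carried an f-prefix with no placeholders (ruff rule F541); the prefix was inert, so it is dropped. Sketch:

import logging

logger = logging.getLogger(__name__)

# logger.debug(f"no placeholders here")  # F541: f-string without placeholders
logger.debug("no placeholders here")     # fixed: plain string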
@@ -278,8 +303,8 @@ class AIDocumentScanner:
         self,
         document: Document,
         text: str,
-        entities: Dict[str, Any],
-    ) -> List[Tuple[int, float]]:
+        entities: dict[str, Any],
+    ) -> list[tuple[int, float]]:
         """
         Suggest relevant tags based on document content and entities.
 

@@ -291,8 +316,8 @@ class AIDocumentScanner:
         Returns:
             List of (tag_id, confidence) tuples
         """
-        from documents.models import Tag
         from documents.matching import match_tags
+        from documents.models import Tag
 
         suggestions = []
 

@@ -340,8 +365,8 @@ class AIDocumentScanner:
         self,
         document: Document,
         text: str,
-        entities: Dict[str, Any],
-    ) -> Optional[Tuple[int, float]]:
+        entities: dict[str, Any],
+    ) -> tuple[int, float] | None:
         """
         Detect correspondent based on document content and entities.
 

@@ -353,19 +378,21 @@ class AIDocumentScanner:
         Returns:
             (correspondent_id, confidence) or None
         """
-        from documents.models import Correspondent
         from documents.matching import match_correspondents
+        from documents.models import Correspondent
 
         try:
             # Use existing matching logic
-            matched_correspondents = match_correspondents(document, self._get_classifier())
+            matched_correspondents = match_correspondents(
+                document, self._get_classifier(),
+            )
 
             if matched_correspondents:
                 correspondent = matched_correspondents[0]
                 confidence = 0.85
                 logger.debug(
                     f"Detected correspondent: {correspondent.name} "
-                    f"(confidence: {confidence})"
+                    f"(confidence: {confidence})",
                 )
                 return (correspondent.id, confidence)
 

@@ -374,14 +401,14 @@ class AIDocumentScanner:
                 org_name = entities["organizations"][0]["text"]
                 # Try to find existing correspondent with similar name
                 correspondents = Correspondent.objects.filter(
-                    name__icontains=org_name[:20]  # First 20 chars
+                    name__icontains=org_name[:20],  # First 20 chars
                 )
                 if correspondents.exists():
                     correspondent = correspondents.first()
                     confidence = 0.70
                     logger.debug(
                         f"Detected correspondent from NER: {correspondent.name} "
-                        f"(confidence: {confidence})"
+                        f"(confidence: {confidence})",
                     )
                     return (correspondent.id, confidence)
 
@@ -394,15 +421,14 @@ class AIDocumentScanner:
         self,
         document: Document,
         text: str,
-        entities: Dict[str, Any],
-    ) -> Optional[Tuple[int, float]]:
+        entities: dict[str, Any],
+    ) -> tuple[int, float] | None:
         """
         Classify document type using ML and content analysis.
 
         Returns:
             (document_type_id, confidence) or None
         """
-        from documents.models import DocumentType
         from documents.matching import match_document_types
 
         try:

@@ -414,7 +440,7 @@ class AIDocumentScanner:
                 confidence = 0.85
                 logger.debug(
                     f"Classified document type: {doc_type.name} "
-                    f"(confidence: {confidence})"
+                    f"(confidence: {confidence})",
                 )
                 return (doc_type.id, confidence)
 

@@ -435,14 +461,13 @@ class AIDocumentScanner:
         document: Document,
         text: str,
         scan_result: AIScanResult,
-    ) -> Optional[Tuple[int, float]]:
+    ) -> tuple[int, float] | None:
         """
         Suggest appropriate storage path based on document characteristics.
 
         Returns:
             (storage_path_id, confidence) or None
         """
-        from documents.models import StoragePath
         from documents.matching import match_storage_paths
 
         try:

@@ -454,7 +479,7 @@ class AIDocumentScanner:
                 confidence = 0.80
                 logger.debug(
                     f"Suggested storage path: {storage_path.name} "
-                    f"(confidence: {confidence})"
+                    f"(confidence: {confidence})",
                 )
                 return (storage_path.id, confidence)
 

@@ -467,8 +492,8 @@ class AIDocumentScanner:
         self,
         document: Document,
         text: str,
-        entities: Dict[str, Any],
-    ) -> Dict[int, Tuple[Any, float]]:
+        entities: dict[str, Any],
+    ) -> dict[int, tuple[Any, float]]:
         """
         Extract values for custom fields using NER and pattern matching.
 

@@ -485,14 +510,16 @@ class AIDocumentScanner:
             for field in custom_fields:
                 # Try to extract field value based on field name and type
                 value, confidence = self._extract_field_value(
-                    field, text, entities
+                    field,
+                    text,
+                    entities,
                 )
 
                 if value is not None and confidence >= self.suggest_threshold:
                     extracted_fields[field.id] = (value, confidence)
                     logger.debug(
                         f"Extracted custom field '{field.name}': {value} "
-                        f"(confidence: {confidence})"
+                        f"(confidence: {confidence})",
                     )
 
         except Exception as e:

@@ -504,8 +531,8 @@ class AIDocumentScanner:
         self,
         field: CustomField,
         text: str,
-        entities: Dict[str, Any],
-    ) -> Tuple[Any, float]:
+        entities: dict[str, Any],
+    ) -> tuple[Any, float]:
         """
         Extract a single custom field value.
 

@@ -521,7 +548,10 @@ class AIDocumentScanner:
             return (dates[0]["text"], 0.75)
 
         # Amount/price fields
-        if any(keyword in field_name_lower for keyword in ["amount", "price", "cost", "total"]):
+        if any(
+            keyword in field_name_lower
+            for keyword in ["amount", "price", "cost", "total"]
+        ):
             amounts = entities.get("amounts", [])
             if amounts:
                 return (amounts[0]["text"], 0.75)
@@ -563,14 +593,15 @@ class AIDocumentScanner:
         document: Document,
         text: str,
         scan_result: AIScanResult,
-    ) -> List[Tuple[int, float]]:
+    ) -> list[tuple[int, float]]:
         """
         Suggest relevant workflows based on document characteristics.
 
         Returns:
             List of (workflow_id, confidence) tuples
         """
-        from documents.models import Workflow, WorkflowTrigger
+        from documents.models import Workflow
+        from documents.models import WorkflowTrigger
 
         suggestions = []
 

@@ -584,14 +615,16 @@ class AIDocumentScanner:
             for workflow in workflows:
                 # Evaluate workflow conditions against scan results
                 confidence = self._evaluate_workflow_match(
-                    workflow, document, scan_result
+                    workflow,
+                    document,
+                    scan_result,
                 )
 
                 if confidence >= self.suggest_threshold:
                     suggestions.append((workflow.id, confidence))
                     logger.debug(
                         f"Suggested workflow: {workflow.name} "
-                        f"(confidence: {confidence})"
+                        f"(confidence: {confidence})",
                     )
 
         except Exception as e:

@@ -634,8 +667,8 @@ class AIDocumentScanner:
         self,
         document: Document,
         text: str,
-        entities: Dict[str, Any],
-    ) -> Optional[str]:
+        entities: dict[str, Any],
+    ) -> str | None:
         """
         Generate an improved title suggestion based on document content.
 

@@ -670,7 +703,7 @@ class AIDocumentScanner:
 
         return None
 
-    def _extract_tables(self, file_path: str) -> List[Dict[str, Any]]:
+    def _extract_tables(self, file_path: str) -> list[dict[str, Any]]:
         """
         Extract tables from document using advanced OCR.
 

@@ -695,7 +728,7 @@ class AIDocumentScanner:
         scan_result: AIScanResult,
         auto_apply: bool = True,
         user_confirmed: bool = False,
-    ) -> Dict[str, Any]:
+    ) -> dict[str, Any]:
         """
         Apply AI scan results to document.
 

@@ -708,7 +741,10 @@ class AIDocumentScanner:
         Returns:
             Dictionary with applied changes and pending suggestions
         """
-        from documents.models import Tag, Correspondent, DocumentType, StoragePath
+        from documents.models import Correspondent
+        from documents.models import DocumentType
+        from documents.models import StoragePath
+        from documents.models import Tag
 
         applied = {
             "tags": [],

@@ -737,11 +773,13 @@ class AIDocumentScanner:
                     logger.info(f"Auto-applied tag: {tag.name}")
                 elif confidence >= self.suggest_threshold:
                     tag = Tag.objects.get(pk=tag_id)
-                    suggestions["tags"].append({
-                        "id": tag_id,
-                        "name": tag.name,
-                        "confidence": confidence,
-                    })
+                    suggestions["tags"].append(
+                        {
+                            "id": tag_id,
+                            "name": tag.name,
+                            "confidence": confidence,
+                        },
+                    )
 
                 # Apply correspondent
                 if scan_result.correspondent:
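The suggestions["tags"].append rewrite above reflects the "magic trailing comma" convention of Black/ruff-format (an inference): once an argument list or literal ends in a trailing comma, the formatter keeps it fully exploded, one element per line, with closing brackets on their own lines. Sketch:

suggestions = []

suggestions.append(
    {
        "id": 3,
        "name": "tax",
        "confidence": 0.8,
    },
)
print(suggestions)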
File 3 of 3: consumer plugin integration

@@ -790,52 +790,52 @@ class ConsumerPlugin(
             # Log what was applied and suggested
             if results["applied"]["tags"]:
                 self.log.info(
-                    f"AI auto-applied tags: {[t['name'] for t in results['applied']['tags']]}"
+                    f"AI auto-applied tags: {[t['name'] for t in results['applied']['tags']]}",
                 )
 
             if results["applied"]["correspondent"]:
                 self.log.info(
-                    f"AI auto-applied correspondent: {results['applied']['correspondent']['name']}"
+                    f"AI auto-applied correspondent: {results['applied']['correspondent']['name']}",
                 )
 
             if results["applied"]["document_type"]:
                 self.log.info(
-                    f"AI auto-applied document type: {results['applied']['document_type']['name']}"
+                    f"AI auto-applied document type: {results['applied']['document_type']['name']}",
                 )
 
             if results["applied"]["storage_path"]:
                 self.log.info(
-                    f"AI auto-applied storage path: {results['applied']['storage_path']['name']}"
+                    f"AI auto-applied storage path: {results['applied']['storage_path']['name']}",
                 )
 
             # Log suggestions for user review
             if results["suggestions"]["tags"]:
                 self.log.info(
                     f"AI suggested tags (require review): "
-                    f"{[t['name'] for t in results['suggestions']['tags']]}"
+                    f"{[t['name'] for t in results['suggestions']['tags']]}",
                 )
 
             if results["suggestions"]["correspondent"]:
                 self.log.info(
                     f"AI suggested correspondent (requires review): "
-                    f"{results['suggestions']['correspondent']['name']}"
+                    f"{results['suggestions']['correspondent']['name']}",
                 )
 
             if results["suggestions"]["document_type"]:
                 self.log.info(
                     f"AI suggested document type (requires review): "
-                    f"{results['suggestions']['document_type']['name']}"
+                    f"{results['suggestions']['document_type']['name']}",
                 )
 
             if results["suggestions"]["storage_path"]:
                 self.log.info(
                     f"AI suggested storage path (requires review): "
-                    f"{results['suggestions']['storage_path']['name']}"
+                    f"{results['suggestions']['storage_path']['name']}",
                 )
 
             # Store suggestions in document metadata for UI to display
             # This allows the frontend to show AI suggestions to users
-            if not hasattr(document, '_ai_suggestions'):
+            if not hasattr(document, "_ai_suggestions"):
                 document._ai_suggestions = results["suggestions"]
 
         except ImportError:
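The commit title mentions pre-commit hooks, and the hunks above look like the output of an automated lint/format pass. A hedged way to reproduce such a pass locally, assuming ruff is installed and src/ is the source root in your checkout:

import subprocess

for cmd in (
    ["ruff", "check", "--fix", "src/"],  # autofixable lint rules (UP, COM, F541, ...)
    ["ruff", "format", "src/"],          # Black-compatible formatting
):
    subprocess.run(cmd, check=False)  # check=False: report failures, don't abort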