Initial exploration: AI Scanner linting and pre-commit hooks

Co-authored-by: dawnsystem <42047891+dawnsystem@users.noreply.github.com>
This commit is contained in:
copilot-swe-agent[bot] 2025-11-12 13:09:03 +00:00
parent 496a9e7b7b
commit 2d7345f0bc
3 changed files with 308 additions and 268 deletions

View file

@@ -14,15 +14,9 @@ According to agents.md requirements:
from __future__ import annotations
import logging
from datetime import datetime
from typing import TYPE_CHECKING, Dict, List, Optional, Any
from typing import Any
from django.conf import settings
from django.contrib.auth.models import User
from django.utils import timezone
if TYPE_CHECKING:
from documents.models import Document, DeletionRequest
logger = logging.getLogger("paperless.ai_deletion")
@@ -36,10 +30,10 @@ class AIDeletionManager:
@staticmethod
def create_deletion_request(
documents: List,
documents: list,
reason: str,
user: User,
impact_analysis: Optional[Dict[str, Any]] = None,
impact_analysis: dict[str, Any] | None = None,
):
"""
Create a new deletion request that requires user approval.
@@ -73,7 +67,7 @@ class AIDeletionManager:
logger.info(
f"Created deletion request {request.id} for {len(documents)} documents "
f"requiring approval from user {user.username}"
f"requiring approval from user {user.username}",
)
# TODO: Send notification to user about pending deletion request
@@ -82,7 +76,7 @@ class AIDeletionManager:
return request
@staticmethod
def _analyze_impact(documents: List) -> Dict[str, Any]:
def _analyze_impact(documents: list) -> dict[str, Any]:
"""
Analyze the impact of deleting the given documents.
@@ -128,10 +122,16 @@ class AIDeletionManager:
# Track date range
if doc.created:
if impact["date_range"]["earliest"] is None or doc.created < impact["date_range"]["earliest"]:
if (
impact["date_range"]["earliest"] is None
or doc.created < impact["date_range"]["earliest"]
):
impact["date_range"]["earliest"] = doc.created
if impact["date_range"]["latest"] is None or doc.created > impact["date_range"]["latest"]:
if (
impact["date_range"]["latest"] is None
or doc.created > impact["date_range"]["latest"]
):
impact["date_range"]["latest"] = doc.created
# Convert sets to lists for JSON serialization
@@ -141,14 +141,16 @@ class AIDeletionManager:
# Convert dates to ISO format
if impact["date_range"]["earliest"]:
impact["date_range"]["earliest"] = impact["date_range"]["earliest"].isoformat()
impact["date_range"]["earliest"] = impact["date_range"][
"earliest"
].isoformat()
if impact["date_range"]["latest"]:
impact["date_range"]["latest"] = impact["date_range"]["latest"].isoformat()
return impact
@staticmethod
def get_pending_requests(user: User) -> List:
def get_pending_requests(user: User) -> list:
"""
Get all pending deletion requests for a user.
@@ -164,7 +166,7 @@ class AIDeletionManager:
DeletionRequest.objects.filter(
user=user,
status=DeletionRequest.STATUS_PENDING,
)
),
)
@staticmethod
@@ -192,25 +194,25 @@ REASON:
{request.ai_reason}
IMPACT SUMMARY:
- Number of documents: {impact.get('document_count', 0)}
- Affected tags: {', '.join(impact.get('affected_tags', [])) or 'None'}
- Affected correspondents: {', '.join(impact.get('affected_correspondents', [])) or 'None'}
- Affected document types: {', '.join(impact.get('affected_types', [])) or 'None'}
- Number of documents: {impact.get("document_count", 0)}
- Affected tags: {", ".join(impact.get("affected_tags", [])) or "None"}
- Affected correspondents: {", ".join(impact.get("affected_correspondents", [])) or "None"}
- Affected document types: {", ".join(impact.get("affected_types", [])) or "None"}
DATE RANGE:
- Earliest: {impact.get('date_range', {}).get('earliest', 'Unknown')}
- Latest: {impact.get('date_range', {}).get('latest', 'Unknown')}
- Earliest: {impact.get("date_range", {}).get("earliest", "Unknown")}
- Latest: {impact.get("date_range", {}).get("latest", "Unknown")}
DOCUMENTS TO BE DELETED:
"""
for i, doc in enumerate(impact.get('documents', []), 1):
for i, doc in enumerate(impact.get("documents", []), 1):
message += f"""
{i}. ID: {doc['id']} - {doc['title']}
Created: {doc['created']}
Correspondent: {doc['correspondent'] or 'None'}
Type: {doc['document_type'] or 'None'}
Tags: {', '.join(doc['tags']) or 'None'}
{i}. ID: {doc["id"]} - {doc["title"]}
Created: {doc["created"]}
Correspondent: {doc["correspondent"] or "None"}
Type: {doc["document_type"] or "None"}
Tags: {", ".join(doc["tags"]) or "None"}
"""
message += """
@@ -240,4 +242,4 @@ approving or rejecting this request.
return False
__all__ = ['AIDeletionManager']
__all__ = ["AIDeletionManager"]

View file

@@ -20,21 +20,16 @@ According to agents.md requirements:
from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Dict, List, Optional, Any, Tuple
from typing import TYPE_CHECKING
from typing import Any
from django.conf import settings
from django.db import transaction
if TYPE_CHECKING:
from documents.models import (
Document,
Tag,
Correspondent,
DocumentType,
StoragePath,
CustomField,
Workflow,
)
from documents.models import CustomField
from documents.models import Document
from documents.models import Workflow
logger = logging.getLogger("paperless.ai_scanner")
@@ -45,17 +40,25 @@ class AIScanResult:
"""
def __init__(self):
self.tags: List[Tuple[int, float]] = [] # [(tag_id, confidence), ...]
self.correspondent: Optional[Tuple[int, float]] = None # (correspondent_id, confidence)
self.document_type: Optional[Tuple[int, float]] = None # (document_type_id, confidence)
self.storage_path: Optional[Tuple[int, float]] = None # (storage_path_id, confidence)
self.custom_fields: Dict[int, Tuple[Any, float]] = {} # {field_id: (value, confidence), ...}
self.workflows: List[Tuple[int, float]] = [] # [(workflow_id, confidence), ...]
self.extracted_entities: Dict[str, Any] = {} # NER results
self.title_suggestion: Optional[str] = None
self.metadata: Dict[str, Any] = {} # Additional metadata
self.tags: list[tuple[int, float]] = [] # [(tag_id, confidence), ...]
self.correspondent: tuple[int, float] | None = (
None # (correspondent_id, confidence)
)
self.document_type: tuple[int, float] | None = (
None # (document_type_id, confidence)
)
self.storage_path: tuple[int, float] | None = (
None # (storage_path_id, confidence)
)
self.custom_fields: dict[
int, tuple[Any, float],
] = {} # {field_id: (value, confidence), ...}
self.workflows: list[tuple[int, float]] = [] # [(workflow_id, confidence), ...]
self.extracted_entities: dict[str, Any] = {} # NER results
self.title_suggestion: str | None = None
self.metadata: dict[str, Any] = {} # Additional metadata
def to_dict(self) -> Dict[str, Any]:
def to_dict(self) -> dict[str, Any]:
"""Convert scan results to dictionary for logging/serialization."""
return {
"tags": self.tags,
@@ -129,7 +132,7 @@ class AIDocumentScanner:
logger.info(
f"AIDocumentScanner initialized - ML: {self.ml_enabled}, "
f"Advanced OCR: {self.advanced_ocr_enabled}"
f"Advanced OCR: {self.advanced_ocr_enabled}",
)
def _get_classifier(self):
@@ -137,6 +140,7 @@ class AIDocumentScanner:
if self._classifier is None and self.ml_enabled:
try:
from documents.ml.classifier import TransformerDocumentClassifier
self._classifier = TransformerDocumentClassifier()
logger.info("ML classifier loaded successfully")
except Exception as e:
@@ -149,6 +153,7 @@ class AIDocumentScanner:
if self._ner_extractor is None and self.ml_enabled:
try:
from documents.ml.ner import DocumentNER
self._ner_extractor = DocumentNER()
logger.info("NER extractor loaded successfully")
except Exception as e:
@@ -160,6 +165,7 @@ class AIDocumentScanner:
if self._semantic_search is None and self.ml_enabled:
try:
from documents.ml.semantic_search import SemanticSearch
self._semantic_search = SemanticSearch()
logger.info("Semantic search loaded successfully")
except Exception as e:
@@ -171,6 +177,7 @@ class AIDocumentScanner:
if self._table_extractor is None and self.advanced_ocr_enabled:
try:
from documents.ocr.table_extractor import TableExtractor
self._table_extractor = TableExtractor()
logger.info("Table extractor loaded successfully")
except Exception as e:
@@ -197,7 +204,9 @@ class AIDocumentScanner:
Returns:
AIScanResult containing all suggestions and extracted data
"""
logger.info(f"Starting AI scan for document: {document.title} (ID: {document.pk})")
logger.info(
f"Starting AI scan for document: {document.title} (ID: {document.pk})",
)
result = AIScanResult()
@@ -205,26 +214,36 @@ class AIDocumentScanner:
result.extracted_entities = self._extract_entities(document_text)
# Analyze and suggest tags
result.tags = self._suggest_tags(document, document_text, result.extracted_entities)
result.tags = self._suggest_tags(
document, document_text, result.extracted_entities,
)
# Detect correspondent
result.correspondent = self._detect_correspondent(
document, document_text, result.extracted_entities
document,
document_text,
result.extracted_entities,
)
# Classify document type
result.document_type = self._classify_document_type(
document, document_text, result.extracted_entities
document,
document_text,
result.extracted_entities,
)
# Suggest storage path
result.storage_path = self._suggest_storage_path(
document, document_text, result
document,
document_text,
result,
)
# Extract custom fields
result.custom_fields = self._extract_custom_fields(
document, document_text, result.extracted_entities
document,
document_text,
result.extracted_entities,
)
# Suggest workflows
@@ -232,7 +251,9 @@ class AIDocumentScanner:
# Generate improved title suggestion
result.title_suggestion = self._suggest_title(
document, document_text, result.extracted_entities
document,
document_text,
result.extracted_entities,
)
# Extract tables if advanced OCR enabled
@@ -244,7 +265,7 @@ class AIDocumentScanner:
return result
def _extract_entities(self, text: str) -> Dict[str, Any]:
def _extract_entities(self, text: str) -> dict[str, Any]:
"""
Extract named entities from document text using NER.
@@ -262,13 +283,17 @@ class AIDocumentScanner:
# Convert string lists to dict format for consistency
for key in ["persons", "organizations", "locations", "misc"]:
if key in entities and isinstance(entities[key], list):
entities[key] = [{"text": e} if isinstance(e, str) else e for e in entities[key]]
entities[key] = [
{"text": e} if isinstance(e, str) else e for e in entities[key]
]
for key in ["dates", "amounts"]:
if key in entities and isinstance(entities[key], list):
entities[key] = [{"text": e} if isinstance(e, str) else e for e in entities[key]]
entities[key] = [
{"text": e} if isinstance(e, str) else e for e in entities[key]
]
logger.debug(f"Extracted entities from NER")
logger.debug("Extracted entities from NER")
return entities
except Exception as e:
logger.error(f"Entity extraction failed: {e}", exc_info=True)
@@ -278,8 +303,8 @@ class AIDocumentScanner:
self,
document: Document,
text: str,
entities: Dict[str, Any],
) -> List[Tuple[int, float]]:
entities: dict[str, Any],
) -> list[tuple[int, float]]:
"""
Suggest relevant tags based on document content and entities.
@@ -291,8 +316,8 @@ class AIDocumentScanner:
Returns:
List of (tag_id, confidence) tuples
"""
from documents.models import Tag
from documents.matching import match_tags
from documents.models import Tag
suggestions = []
@@ -340,8 +365,8 @@ class AIDocumentScanner:
self,
document: Document,
text: str,
entities: Dict[str, Any],
) -> Optional[Tuple[int, float]]:
entities: dict[str, Any],
) -> tuple[int, float] | None:
"""
Detect correspondent based on document content and entities.
@@ -353,19 +378,21 @@ class AIDocumentScanner:
Returns:
(correspondent_id, confidence) or None
"""
from documents.models import Correspondent
from documents.matching import match_correspondents
from documents.models import Correspondent
try:
# Use existing matching logic
matched_correspondents = match_correspondents(document, self._get_classifier())
matched_correspondents = match_correspondents(
document, self._get_classifier(),
)
if matched_correspondents:
correspondent = matched_correspondents[0]
confidence = 0.85
logger.debug(
f"Detected correspondent: {correspondent.name} "
f"(confidence: {confidence})"
f"(confidence: {confidence})",
)
return (correspondent.id, confidence)
@@ -374,14 +401,14 @@ class AIDocumentScanner:
org_name = entities["organizations"][0]["text"]
# Try to find existing correspondent with similar name
correspondents = Correspondent.objects.filter(
name__icontains=org_name[:20] # First 20 chars
name__icontains=org_name[:20], # First 20 chars
)
if correspondents.exists():
correspondent = correspondents.first()
confidence = 0.70
logger.debug(
f"Detected correspondent from NER: {correspondent.name} "
f"(confidence: {confidence})"
f"(confidence: {confidence})",
)
return (correspondent.id, confidence)
@@ -394,15 +421,14 @@ class AIDocumentScanner:
self,
document: Document,
text: str,
entities: Dict[str, Any],
) -> Optional[Tuple[int, float]]:
entities: dict[str, Any],
) -> tuple[int, float] | None:
"""
Classify document type using ML and content analysis.
Returns:
(document_type_id, confidence) or None
"""
from documents.models import DocumentType
from documents.matching import match_document_types
try:
@@ -414,7 +440,7 @@ class AIDocumentScanner:
confidence = 0.85
logger.debug(
f"Classified document type: {doc_type.name} "
f"(confidence: {confidence})"
f"(confidence: {confidence})",
)
return (doc_type.id, confidence)
@@ -435,14 +461,13 @@ class AIDocumentScanner:
document: Document,
text: str,
scan_result: AIScanResult,
) -> Optional[Tuple[int, float]]:
) -> tuple[int, float] | None:
"""
Suggest appropriate storage path based on document characteristics.
Returns:
(storage_path_id, confidence) or None
"""
from documents.models import StoragePath
from documents.matching import match_storage_paths
try:
@@ -454,7 +479,7 @@ class AIDocumentScanner:
confidence = 0.80
logger.debug(
f"Suggested storage path: {storage_path.name} "
f"(confidence: {confidence})"
f"(confidence: {confidence})",
)
return (storage_path.id, confidence)
@@ -467,8 +492,8 @@ class AIDocumentScanner:
self,
document: Document,
text: str,
entities: Dict[str, Any],
) -> Dict[int, Tuple[Any, float]]:
entities: dict[str, Any],
) -> dict[int, tuple[Any, float]]:
"""
Extract values for custom fields using NER and pattern matching.
@@ -485,14 +510,16 @@ class AIDocumentScanner:
for field in custom_fields:
# Try to extract field value based on field name and type
value, confidence = self._extract_field_value(
field, text, entities
field,
text,
entities,
)
if value is not None and confidence >= self.suggest_threshold:
extracted_fields[field.id] = (value, confidence)
logger.debug(
f"Extracted custom field '{field.name}': {value} "
f"(confidence: {confidence})"
f"(confidence: {confidence})",
)
except Exception as e:
@@ -504,8 +531,8 @@ class AIDocumentScanner:
self,
field: CustomField,
text: str,
entities: Dict[str, Any],
) -> Tuple[Any, float]:
entities: dict[str, Any],
) -> tuple[Any, float]:
"""
Extract a single custom field value.
@@ -521,7 +548,10 @@ class AIDocumentScanner:
return (dates[0]["text"], 0.75)
# Amount/price fields
if any(keyword in field_name_lower for keyword in ["amount", "price", "cost", "total"]):
if any(
keyword in field_name_lower
for keyword in ["amount", "price", "cost", "total"]
):
amounts = entities.get("amounts", [])
if amounts:
return (amounts[0]["text"], 0.75)
@@ -563,14 +593,15 @@ class AIDocumentScanner:
document: Document,
text: str,
scan_result: AIScanResult,
) -> List[Tuple[int, float]]:
) -> list[tuple[int, float]]:
"""
Suggest relevant workflows based on document characteristics.
Returns:
List of (workflow_id, confidence) tuples
"""
from documents.models import Workflow, WorkflowTrigger
from documents.models import Workflow
from documents.models import WorkflowTrigger
suggestions = []
@@ -584,14 +615,16 @@ class AIDocumentScanner:
for workflow in workflows:
# Evaluate workflow conditions against scan results
confidence = self._evaluate_workflow_match(
workflow, document, scan_result
workflow,
document,
scan_result,
)
if confidence >= self.suggest_threshold:
suggestions.append((workflow.id, confidence))
logger.debug(
f"Suggested workflow: {workflow.name} "
f"(confidence: {confidence})"
f"(confidence: {confidence})",
)
except Exception as e:
@@ -634,8 +667,8 @@ class AIDocumentScanner:
self,
document: Document,
text: str,
entities: Dict[str, Any],
) -> Optional[str]:
entities: dict[str, Any],
) -> str | None:
"""
Generate an improved title suggestion based on document content.
@@ -670,7 +703,7 @@ class AIDocumentScanner:
return None
def _extract_tables(self, file_path: str) -> List[Dict[str, Any]]:
def _extract_tables(self, file_path: str) -> list[dict[str, Any]]:
"""
Extract tables from document using advanced OCR.
@@ -695,7 +728,7 @@ class AIDocumentScanner:
scan_result: AIScanResult,
auto_apply: bool = True,
user_confirmed: bool = False,
) -> Dict[str, Any]:
) -> dict[str, Any]:
"""
Apply AI scan results to document.
@@ -708,7 +741,10 @@ class AIDocumentScanner:
Returns:
Dictionary with applied changes and pending suggestions
"""
from documents.models import Tag, Correspondent, DocumentType, StoragePath
from documents.models import Correspondent
from documents.models import DocumentType
from documents.models import StoragePath
from documents.models import Tag
applied = {
"tags": [],
@@ -737,11 +773,13 @@ class AIDocumentScanner:
logger.info(f"Auto-applied tag: {tag.name}")
elif confidence >= self.suggest_threshold:
tag = Tag.objects.get(pk=tag_id)
suggestions["tags"].append({
suggestions["tags"].append(
{
"id": tag_id,
"name": tag.name,
"confidence": confidence,
})
},
)
# Apply correspondent
if scan_result.correspondent:

View file

@@ -790,52 +790,52 @@ class ConsumerPlugin(
# Log what was applied and suggested
if results["applied"]["tags"]:
self.log.info(
f"AI auto-applied tags: {[t['name'] for t in results['applied']['tags']]}"
f"AI auto-applied tags: {[t['name'] for t in results['applied']['tags']]}",
)
if results["applied"]["correspondent"]:
self.log.info(
f"AI auto-applied correspondent: {results['applied']['correspondent']['name']}"
f"AI auto-applied correspondent: {results['applied']['correspondent']['name']}",
)
if results["applied"]["document_type"]:
self.log.info(
f"AI auto-applied document type: {results['applied']['document_type']['name']}"
f"AI auto-applied document type: {results['applied']['document_type']['name']}",
)
if results["applied"]["storage_path"]:
self.log.info(
f"AI auto-applied storage path: {results['applied']['storage_path']['name']}"
f"AI auto-applied storage path: {results['applied']['storage_path']['name']}",
)
# Log suggestions for user review
if results["suggestions"]["tags"]:
self.log.info(
f"AI suggested tags (require review): "
f"{[t['name'] for t in results['suggestions']['tags']]}"
f"{[t['name'] for t in results['suggestions']['tags']]}",
)
if results["suggestions"]["correspondent"]:
self.log.info(
f"AI suggested correspondent (requires review): "
f"{results['suggestions']['correspondent']['name']}"
f"{results['suggestions']['correspondent']['name']}",
)
if results["suggestions"]["document_type"]:
self.log.info(
f"AI suggested document type (requires review): "
f"{results['suggestions']['document_type']['name']}"
f"{results['suggestions']['document_type']['name']}",
)
if results["suggestions"]["storage_path"]:
self.log.info(
f"AI suggested storage path (requires review): "
f"{results['suggestions']['storage_path']['name']}"
f"{results['suggestions']['storage_path']['name']}",
)
# Store suggestions in document metadata for UI to display
# This allows the frontend to show AI suggestions to users
if not hasattr(document, '_ai_suggestions'):
if not hasattr(document, "_ai_suggestions"):
document._ai_suggestions = results["suggestions"]
except ImportError: