mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-12-14 10:36:58 +01:00
Changes before error encountered
Co-authored-by: dawnsystem <42047891+dawnsystem@users.noreply.github.com>
This commit is contained in:
parent
275ff4d1d4
commit
cc9e66c11c
2 changed files with 1015 additions and 0 deletions
573
src/documents/management/commands/scan_documents_ai.py
Normal file
573
src/documents/management/commands/scan_documents_ai.py
Normal file
|
|
@ -0,0 +1,573 @@
|
||||||
|
"""
|
||||||
|
Management command to apply AI scanner to existing documents.
|
||||||
|
|
||||||
|
This command allows batch processing of documents through the AI scanner,
|
||||||
|
enabling metadata suggestions for documents that were added before the
|
||||||
|
AI scanner was implemented or to re-scan documents with updated AI models.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import tqdm
|
||||||
|
from django.core.management.base import BaseCommand
|
||||||
|
from django.core.management.base import CommandError
|
||||||
|
from django.utils import timezone
|
||||||
|
|
||||||
|
from documents.ai_scanner import AIScanResult
|
||||||
|
from documents.ai_scanner import get_ai_scanner
|
||||||
|
from documents.management.commands.mixins import ProgressBarMixin
|
||||||
|
from documents.models import Document
|
||||||
|
from documents.models import DocumentType
|
||||||
|
from documents.models import Tag
|
||||||
|
|
||||||
|
logger = logging.getLogger("paperless.management.scan_documents_ai")
|
||||||
|
|
||||||
|
|
||||||
|
class Command(ProgressBarMixin, BaseCommand):
    """
    Management command to apply the AI scanner to existing documents.

    The selected documents are passed through the scanner returned by
    ``get_ai_scanner()``. Suggestions are filtered by a confidence threshold,
    summarised on stdout and, when ``--auto-apply-high-confidence`` is given
    without ``--dry-run``, handed back to the scanner to be applied.
    """

    help = (
        "Apply AI scanner to existing documents to generate metadata suggestions. "
        "Supports filtering by document type, date range, and auto-apply for high "
        "confidence suggestions. Use --dry-run to preview suggestions without applying."
    )

    def add_arguments(self, parser):
        """Register the filtering, processing, progress-bar and batching options."""
        # Filtering options
        parser.add_argument(
            "--all",
            action="store_true",
            default=False,
            help="Scan all documents in the system",
        )

        parser.add_argument(
            "--filter-by-type",
            type=int,
            nargs="+",
            metavar="TYPE_ID",
            help="Filter documents by document type ID(s). Can specify multiple IDs.",
        )

        parser.add_argument(
            "--date-range",
            nargs=2,
            metavar=("START_DATE", "END_DATE"),
            help=(
                "Filter documents by creation date range. "
                "Format: YYYY-MM-DD YYYY-MM-DD. Example: 2024-01-01 2024-12-31"
            ),
        )

        parser.add_argument(
            "--id-range",
            nargs=2,
            type=int,
            metavar=("START_ID", "END_ID"),
            help="Filter documents by ID range. Example: 1 100",
        )

        # Processing options
        parser.add_argument(
            "--dry-run",
            action="store_true",
            default=False,
            help="Preview suggestions without applying any changes",
        )

        parser.add_argument(
            "--auto-apply-high-confidence",
            action="store_true",
            default=False,
            help=(
                "Automatically apply suggestions with high confidence (>=80%%). "
                "Lower confidence suggestions will still be shown for review."
            ),
        )

        parser.add_argument(
            "--confidence-threshold",
            type=float,
            default=0.60,
            help=(
                "Minimum confidence threshold for showing suggestions (0.0-1.0). "
                "Default: 0.60 (60%%)"
            ),
        )

        # Progress bar
        self.add_argument_progress_bar_mixin(parser)

        # Batch size for processing
        parser.add_argument(
            "--batch-size",
            type=int,
            default=100,
            help="Number of documents to process in memory at once. Default: 100",
        )

    def handle(self, *args, **options):
        """
        Execute the command.

        Validates the options, builds the queryset, and returns early with a
        warning when nothing matches. Otherwise scans the documents and prints
        a summary before and after processing.

        Raises:
            CommandError: If the options are invalid or the AI scanner cannot
                be initialised.
        """
        self.handle_progress_bar_mixin(**options)

        # Validate arguments
        self._validate_arguments(options)

        # Get queryset based on filters
        queryset = self._build_queryset(options)
        document_count = queryset.count()

        if document_count == 0:
            self.stdout.write(
                self.style.WARNING("No documents found matching the specified filters."),
            )
            return

        # Initialize AI scanner
        try:
            scanner = get_ai_scanner()
        except Exception as e:
            # Chain the cause so the original traceback is preserved
            raise CommandError(f"Failed to initialize AI scanner: {e}") from e

        self._display_operation_summary(options, document_count)

        results = self._process_documents(
            queryset=queryset,
            scanner=scanner,
            options=options,
        )

        self._display_final_summary(results, options)

    def _validate_arguments(self, options):
        """
        Validate command line arguments.

        On success, the parsed timezone-aware bounds of ``--date-range`` are
        stored in ``options`` under ``_parsed_start_date`` and
        ``_parsed_end_date`` for use by ``_build_queryset``.

        Raises:
            CommandError: If no filter was given, the confidence threshold is
                outside [0.0, 1.0], the batch size is not positive, the date
                range is malformed or reversed, or a document type ID does
                not exist.
        """
        # At least one filter must be specified
        has_filter = any(
            (
                options["all"],
                options["filter_by_type"],
                options["date_range"],
                options["id_range"],
            ),
        )
        if not has_filter:
            raise CommandError(
                "You must specify at least one filter: "
                "--all, --filter-by-type, --date-range, or --id-range",
            )

        # Validate confidence threshold
        if not 0.0 <= options["confidence_threshold"] <= 1.0:
            raise CommandError("Confidence threshold must be between 0.0 and 1.0")

        # A zero batch size makes range() raise and a negative one silently
        # yields no batches, so reject both up front.
        batch_size = options.get("batch_size")
        if batch_size is not None and batch_size < 1:
            raise CommandError("Batch size must be a positive integer")

        # Validate date range format
        if options["date_range"]:
            try:
                start_str, end_str = options["date_range"]
                start_date = datetime.strptime(start_str, "%Y-%m-%d")
                end_date = datetime.strptime(end_str, "%Y-%m-%d")
            except ValueError as e:
                raise CommandError(
                    f"Invalid date format. Use YYYY-MM-DD. Error: {e}",
                ) from e

            if start_date > end_date:
                raise CommandError("Start date must be before end date")

            # Store parsed dates for later use. The end bound covers the whole
            # final day, including the last second's microseconds.
            options["_parsed_start_date"] = timezone.make_aware(start_date)
            options["_parsed_end_date"] = timezone.make_aware(
                end_date.replace(hour=23, minute=59, second=59, microsecond=999999),
            )

        # Validate document types exist (single query for all requested IDs)
        type_ids = options["filter_by_type"]
        if type_ids:
            known_ids = set(
                DocumentType.objects.filter(pk__in=type_ids).values_list(
                    "pk",
                    flat=True,
                ),
            )
            for type_id in type_ids:
                if type_id not in known_ids:
                    raise CommandError(
                        f"Document type with ID {type_id} does not exist",
                    )

    def _build_queryset(self, options):
        """Build the document queryset selected by the filter options, ordered by ID."""
        queryset = Document.objects.all()

        # Filter by document type
        if options["filter_by_type"]:
            queryset = queryset.filter(document_type__id__in=options["filter_by_type"])

        # Filter by date range (bounds were stored by _validate_arguments)
        if options["date_range"]:
            queryset = queryset.filter(
                created__gte=options["_parsed_start_date"],
                created__lte=options["_parsed_end_date"],
            )

        # Filter by ID range
        if options["id_range"]:
            start_id, end_id = options["id_range"]
            queryset = queryset.filter(id__gte=start_id, id__lte=end_id)

        # Order by ID for consistent processing
        return queryset.order_by("id")

    def _display_operation_summary(self, options, document_count):
        """Display the active filters, processing mode and document count before starting."""
        write = self.stdout.write

        write(self.style.SUCCESS("\n" + "=" * 70))
        write(self.style.SUCCESS("AI Document Scanner - Batch Processing"))
        write(self.style.SUCCESS("=" * 70 + "\n"))

        # Display filters
        write("Filters applied:")
        if options["all"]:
            write(" • Processing ALL documents")
        if options["filter_by_type"]:
            type_ids = ", ".join(str(tid) for tid in options["filter_by_type"])
            write(f" • Document types: {type_ids}")
        if options["date_range"]:
            start, end = options["date_range"]
            write(f" • Date range: {start} to {end}")
        if options["id_range"]:
            start, end = options["id_range"]
            write(f" • ID range: {start} to {end}")

        # Display processing mode
        write("\nProcessing mode:")
        if options["dry_run"]:
            write(self.style.WARNING(" • DRY RUN - No changes will be applied"))
        elif options["auto_apply_high_confidence"]:
            write(" • Auto-apply high confidence suggestions (≥80%)")
        else:
            write(" • Preview mode - No changes will be applied")

        write(
            f" • Confidence threshold: {options['confidence_threshold']:.0%}",
        )

        # Display document count
        write(
            f"\n{self.style.SUCCESS('Documents to process:')} {document_count}",
        )
        write("\n" + "=" * 70 + "\n")

    def _process_documents(
        self,
        queryset,
        scanner,
        options,
    ) -> dict[str, Any]:
        """
        Process documents through the AI scanner.

        The primary keys of the selected documents are snapshotted up front and
        the documents are then loaded in chunks of ``--batch-size``. Slicing
        the live queryset per batch would skip documents whenever an
        auto-applied change moved a document out of the filter.

        Documents without text content are skipped with a warning. An
        exception raised for one document is logged and recorded without
        aborting the run.

        Returns:
            Dictionary with the counters ``processed``, ``errors``,
            ``suggestions_generated`` and ``auto_applied`` plus the lists
            ``documents_with_suggestions`` and ``error_documents``.
        """
        results = {
            "processed": 0,
            "errors": 0,
            "suggestions_generated": 0,
            "auto_applied": 0,
            "documents_with_suggestions": [],
            "error_documents": [],
        }

        batch_size = options["batch_size"]
        confidence_threshold = options["confidence_threshold"]
        # --dry-run always wins over --auto-apply-high-confidence
        auto_apply = options["auto_apply_high_confidence"] and not options["dry_run"]

        # Stable snapshot of what to process, immune to changes made below
        document_ids = list(queryset.values_list("pk", flat=True))

        for offset in tqdm.tqdm(
            range(0, len(document_ids), batch_size),
            disable=self.no_progress_bar,
            desc="Processing batches",
        ):
            batch_ids = document_ids[offset : offset + batch_size]
            batch = queryset.filter(pk__in=batch_ids)

            for document in batch:
                try:
                    document_text = document.content or ""

                    if not document_text:
                        logger.warning(
                            "Document %s has no text content, skipping",
                            document.id,
                        )
                        continue

                    scan_result = scanner.scan_document(
                        document=document,
                        document_text=document_text,
                    )

                    filtered_result = self._filter_by_confidence(
                        scan_result,
                        confidence_threshold,
                    )

                    suggestion_count = self._count_suggestions(filtered_result)

                    if suggestion_count > 0:
                        results["suggestions_generated"] += suggestion_count

                        applied = None
                        if auto_apply:
                            applied = scanner.apply_scan_results(
                                document=document,
                                scan_result=filtered_result,
                                auto_apply=True,
                            )
                            results["auto_applied"] += self._count_applied(applied)

                        # Store for summary
                        results["documents_with_suggestions"].append(
                            {
                                "id": document.id,
                                "title": document.title,
                                "suggestions": filtered_result.to_dict(),
                                "applied": applied,
                            },
                        )

                    results["processed"] += 1

                except Exception as e:
                    # Deliberately broad: one failing document must not stop the batch
                    logger.error(
                        "Error processing document %s: %s",
                        document.id,
                        e,
                        exc_info=True,
                    )
                    results["errors"] += 1
                    results["error_documents"].append(
                        {
                            "id": document.id,
                            "title": document.title,
                            "error": str(e),
                        },
                    )

        return results

    @staticmethod
    def _count_applied(applied_result) -> int:
        """
        Count the items listed under the ``"applied"`` key of an apply result.

        Sequence values (e.g. the tag list) contribute their length and any
        other truthy value contributes one.
        NOTE(review): assumes non-list entries describe a single applied
        object, as the correspondent/document type/storage path entries read
        in ``_display_document_suggestions`` do - confirm for custom fields.
        """
        applied = applied_result.get("applied", {})
        total = 0
        for value in applied.values():
            if isinstance(value, (list, tuple, set)):
                total += len(value)
            elif value:
                total += 1
        return total

    def _filter_by_confidence(
        self,
        scan_result: AIScanResult,
        threshold: float,
    ) -> AIScanResult:
        """
        Return a new scan result holding only suggestions at or above ``threshold``.

        Tags, correspondent, document type, storage path, custom fields and
        workflows are filtered by their confidence. Extracted entities, the
        title suggestion and metadata are copied unchanged.
        """
        filtered = AIScanResult()

        # Filter tags
        filtered.tags = [
            (tag_id, conf) for tag_id, conf in scan_result.tags if conf >= threshold
        ]

        # Filter the single-valued (id, confidence) suggestions
        for attr in ("correspondent", "document_type", "storage_path"):
            suggestion = getattr(scan_result, attr)
            if suggestion:
                _, conf = suggestion
                if conf >= threshold:
                    setattr(filtered, attr, suggestion)

        # Filter custom fields
        for field_id, (value, conf) in scan_result.custom_fields.items():
            if conf >= threshold:
                filtered.custom_fields[field_id] = (value, conf)

        # Filter workflows
        filtered.workflows = [
            (wf_id, conf) for wf_id, conf in scan_result.workflows if conf >= threshold
        ]

        # Copy other fields as-is
        filtered.extracted_entities = scan_result.extracted_entities
        filtered.title_suggestion = scan_result.title_suggestion
        filtered.metadata = scan_result.metadata

        return filtered

    def _count_suggestions(self, scan_result: AIScanResult) -> int:
        """Count total number of suggestions in scan result."""
        single_valued = (
            scan_result.correspondent,
            scan_result.document_type,
            scan_result.storage_path,
            scan_result.title_suggestion,
        )
        return (
            len(scan_result.tags)
            + len(scan_result.custom_fields)
            + len(scan_result.workflows)
            + sum(1 for value in single_valued if value)
        )

    def _display_final_summary(self, results: dict[str, Any], options):
        """Display statistics, sample suggestions, errors and a closing message."""
        write = self.stdout.write
        applying = options["auto_apply_high_confidence"] and not options["dry_run"]

        write("\n" + "=" * 70)
        write(self.style.SUCCESS("Processing Complete - Summary"))
        write("=" * 70 + "\n")

        # Display statistics
        write("Statistics:")
        write(f" • Documents processed: {results['processed']}")
        write(
            f" • Documents with suggestions: {len(results['documents_with_suggestions'])}",
        )
        write(f" • Total suggestions generated: {results['suggestions_generated']}")

        if applying:
            write(
                self.style.SUCCESS(
                    f" • Suggestions auto-applied: {results['auto_applied']}",
                ),
            )

        if results["errors"] > 0:
            write(
                self.style.ERROR(f" • Errors encountered: {results['errors']}"),
            )

        # Display sample suggestions
        if results["documents_with_suggestions"]:
            write("\n" + "-" * 70)
            write("Sample Suggestions (first 5 documents):\n")

            for doc_info in results["documents_with_suggestions"][:5]:
                self._display_document_suggestions(doc_info, options)

        # Display errors (at most the first ten)
        if results["error_documents"]:
            write("\n" + "-" * 70)
            write(self.style.ERROR("Errors:\n"))

            for error_info in results["error_documents"][:10]:
                write(
                    f" • Document {error_info['id']}: {error_info['title']}",
                )
                write(f" Error: {error_info['error']}")

        # Final message
        write("\n" + "=" * 70)
        if options["dry_run"]:
            write(
                self.style.WARNING(
                    "DRY RUN completed - No changes were applied to documents.",
                ),
            )
        elif options["auto_apply_high_confidence"]:
            write(
                self.style.SUCCESS(
                    f"Processing complete - {results['auto_applied']} high confidence "
                    "suggestions were automatically applied.",
                ),
            )
        else:
            write(
                self.style.SUCCESS(
                    "Processing complete - Suggestions generated. Use "
                    "--auto-apply-high-confidence to apply them automatically.",
                ),
            )
        write("=" * 70 + "\n")

    def _display_document_suggestions(self, doc_info: dict[str, Any], options):
        """
        Display suggestions for a single document.

        Shows up to three tags, then correspondent, document type, storage
        path and title suggestion, followed by any auto-applied changes.
        Suggestions whose referenced object no longer exists are silently
        omitted, since this output is informational only.
        """
        from documents.models import Correspondent
        from documents.models import StoragePath

        write = self.stdout.write

        write(
            f"\n Document #{doc_info['id']}: {doc_info['title']}",
        )

        suggestions = doc_info["suggestions"]

        # Tags
        if suggestions.get("tags"):
            write(" Tags:")
            for tag_id, conf in suggestions["tags"][:3]:  # Show first 3
                tag = Tag.objects.filter(pk=tag_id).first()
                if tag is not None:
                    write(
                        f" • {tag.name} (confidence: {conf:.0%})",
                    )

        # Correspondent, document type and storage path share one format
        single_valued = (
            ("correspondent", "Correspondent", Correspondent),
            ("document_type", "Document Type", DocumentType),
            ("storage_path", "Storage Path", StoragePath),
        )
        for key, label, model in single_valued:
            if not suggestions.get(key):
                continue
            object_id, conf = suggestions[key]
            instance = model.objects.filter(pk=object_id).first()
            if instance is not None:
                write(
                    f" {label}: {instance.name} (confidence: {conf:.0%})",
                )

        # Title suggestion
        if suggestions.get("title_suggestion"):
            write(
                f" Title: {suggestions['title_suggestion']}",
            )

        # Applied changes (if auto-apply was enabled)
        if doc_info.get("applied"):
            applied = doc_info["applied"].get("applied", {})
            if any(applied.values()):
                write(
                    self.style.SUCCESS(" ✓ Applied changes:"),
                )
                if applied.get("tags"):
                    tag_names = [t["name"] for t in applied["tags"]]
                    write(
                        f" • Tags: {', '.join(tag_names)}",
                    )
                applied_labels = (
                    ("correspondent", "Correspondent"),
                    ("document_type", "Type"),
                    ("storage_path", "Path"),
                )
                for key, label in applied_labels:
                    if applied.get(key):
                        write(
                            f" • {label}: {applied[key]['name']}",
                        )
|
||||||
442
src/documents/tests/test_management_scan_ai.py
Normal file
442
src/documents/tests/test_management_scan_ai.py
Normal file
|
|
@ -0,0 +1,442 @@
|
||||||
|
"""
|
||||||
|
Tests for the scan_documents_ai management command.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from io import StringIO
|
||||||
|
from unittest import mock
|
||||||
|
|
||||||
|
from django.core.management import CommandError
|
||||||
|
from django.core.management import call_command
|
||||||
|
from django.test import TestCase
|
||||||
|
from django.test import override_settings
|
||||||
|
from django.utils import timezone
|
||||||
|
|
||||||
|
from documents.ai_scanner import AIScanResult
|
||||||
|
from documents.models import Correspondent
|
||||||
|
from documents.models import Document
|
||||||
|
from documents.models import DocumentType
|
||||||
|
from documents.models import Tag
|
||||||
|
from documents.tests.utils import DirectoriesMixin
|
||||||
|
|
||||||
|
|
||||||
|
class TestScanDocumentsAICommand(DirectoriesMixin, TestCase):
|
||||||
|
"""Test cases for the scan_documents_ai management command."""
|
||||||
|
|
||||||
|
def setUp(self):
    """Create the document types, tags, correspondent and three documents used by the tests."""
    # Imported locally: django.utils.timezone only re-exports timedelta as an
    # implementation detail, so take it from the standard library instead.
    from datetime import timedelta

    super().setUp()

    # Create test document types
    self.doc_type_invoice = DocumentType.objects.create(name="Invoice")
    self.doc_type_receipt = DocumentType.objects.create(name="Receipt")

    # Create test tags
    self.tag_important = Tag.objects.create(name="Important")
    self.tag_tax = Tag.objects.create(name="Tax")

    # Create test correspondent
    self.correspondent = Correspondent.objects.create(name="Test Company")

    # Create test documents
    self.doc1 = Document.objects.create(
        title="Test Document 1",
        content="This is a test invoice document with important information.",
        mime_type="application/pdf",
        checksum="ABC123",
    )

    self.doc2 = Document.objects.create(
        title="Test Document 2",
        content="This is another test receipt document.",
        mime_type="application/pdf",
        checksum="DEF456",
        document_type=self.doc_type_receipt,
    )

    # Created a year ago so date-range tests can exclude it
    self.doc3 = Document.objects.create(
        title="Test Document 3",
        content="A third document for testing date ranges.",
        mime_type="application/pdf",
        checksum="GHI789",
        created=timezone.now() - timedelta(days=365),
    )
|
||||||
|
|
||||||
|
def test_command_requires_filter(self):
    """Running the command without any filter option raises CommandError."""
    with self.assertRaises(CommandError) as raised:
        call_command("scan_documents_ai")

    message = str(raised.exception)
    self.assertIn("at least one filter", message)
|
||||||
|
|
||||||
|
def test_command_all_flag(self):
    """The --all flag processes documents and prints the final summary."""
    # Scanner double returning a single high-confidence tag suggestion
    scan_result = AIScanResult()
    scan_result.tags = [(self.tag_important.id, 0.85)]
    scanner = mock.Mock()
    scanner.scan_document.return_value = scan_result

    captured = StringIO()
    with mock.patch(
        "documents.management.commands.scan_documents_ai.get_ai_scanner",
        return_value=scanner,
    ):
        call_command(
            "scan_documents_ai",
            "--all",
            "--dry-run",
            "--no-progress-bar",
            stdout=captured,
        )

    text = captured.getvalue()
    for expected in ("Processing Complete", "Documents processed:"):
        self.assertIn(expected, text)
|
||||||
|
|
||||||
|
def test_command_filter_by_type(self):
    """With --filter-by-type only documents of that type are scanned."""
    scanner = mock.Mock()
    scanner.scan_document.return_value = AIScanResult()

    argv = [
        "--filter-by-type",
        str(self.doc_type_receipt.id),
        "--dry-run",
        "--no-progress-bar",
    ]
    with mock.patch(
        "documents.management.commands.scan_documents_ai.get_ai_scanner",
        return_value=scanner,
    ):
        call_command("scan_documents_ai", *argv, stdout=StringIO())

    # Only doc2 carries the receipt type
    self.assertEqual(scanner.scan_document.call_count, 1)
|
||||||
|
|
||||||
|
def test_command_invalid_document_type(self):
    """An unknown document type ID is rejected with CommandError."""
    argv = ["--filter-by-type", "99999", "--dry-run"]

    with self.assertRaises(CommandError) as raised:
        call_command("scan_documents_ai", *argv)

    message = str(raised.exception)
    self.assertIn("does not exist", message)
|
||||||
|
|
||||||
|
def test_command_date_range(self):
    """With --date-range only documents created inside the range are scanned."""
    from datetime import timedelta

    scanner = mock.Mock()
    scanner.scan_document.return_value = AIScanResult()

    # The command makes the parsed dates aware in the current time zone, so
    # build the range from the local date rather than the UTC date.
    today = timezone.localdate()
    yesterday = today - timedelta(days=1)

    out = StringIO()
    with mock.patch(
        "documents.management.commands.scan_documents_ai.get_ai_scanner",
        return_value=scanner,
    ):
        call_command(
            "scan_documents_ai",
            "--date-range",
            str(yesterday),
            str(today),
            "--dry-run",
            "--no-progress-bar",
            stdout=out,
        )

    # Exactly doc1 and doc2 (recent) are scanned; doc3 (a year old) is not
    scanned_ids = {
        call.kwargs["document"].id
        for call in scanner.scan_document.call_args_list
    }
    self.assertEqual(scanned_ids, {self.doc1.id, self.doc2.id})
|
||||||
|
|
||||||
|
def test_command_invalid_date_range(self):
    """A date range whose end precedes its start is rejected."""
    # End before start
    reversed_range = ("2024-12-31", "2024-01-01")

    with self.assertRaises(CommandError) as raised:
        call_command(
            "scan_documents_ai",
            "--date-range",
            *reversed_range,
            "--dry-run",
        )

    message = str(raised.exception)
    self.assertIn("Start date must be before end date", message)
|
||||||
|
|
||||||
|
def test_command_invalid_date_format(self):
    """Dates that are not in YYYY-MM-DD form are rejected."""
    # Wrong format
    malformed_range = ("01/01/2024", "12/31/2024")

    with self.assertRaises(CommandError) as raised:
        call_command(
            "scan_documents_ai",
            "--date-range",
            *malformed_range,
            "--dry-run",
        )

    message = str(raised.exception)
    self.assertIn("Invalid date format", message)
|
||||||
|
|
||||||
|
def test_command_id_range(self):
    """With --id-range only documents inside the ID range are scanned."""
    scanner = mock.Mock()
    scanner.scan_document.return_value = AIScanResult()

    only_id = str(self.doc1.id)
    with mock.patch(
        "documents.management.commands.scan_documents_ai.get_ai_scanner",
        return_value=scanner,
    ):
        call_command(
            "scan_documents_ai",
            "--id-range",
            only_id,
            only_id,
            "--dry-run",
            "--no-progress-bar",
            stdout=StringIO(),
        )

    # The range covers doc1 alone
    self.assertEqual(scanner.scan_document.call_count, 1)
|
||||||
|
|
||||||
|
def test_command_confidence_threshold(self):
    """A lowered --confidence-threshold keeps suggestions the default would drop."""
    # Suggestion at 0.50: below the 0.60 default, above the 0.40 used here
    mock_result = AIScanResult()
    mock_result.tags = [(self.tag_important.id, 0.50)]
    scanner = mock.Mock()
    scanner.scan_document.return_value = mock_result

    out = StringIO()
    with mock.patch(
        "documents.management.commands.scan_documents_ai.get_ai_scanner",
        return_value=scanner,
    ):
        call_command(
            "scan_documents_ai",
            "--all",
            "--dry-run",
            "--confidence-threshold",
            "0.40",  # Lower threshold
            "--no-progress-bar",
            stdout=out,
        )

    output = out.getvalue()
    # The summary line is always printed, so also check that the low-confidence
    # suggestion survived filtering for each of the three documents.
    self.assertIn("suggestions generated", output.lower())
    self.assertNotIn("Total suggestions generated: 0", output)
    self.assertIn("Documents with suggestions: 3", output)
|
||||||
|
|
||||||
|
def test_command_invalid_confidence_threshold(self):
    """A confidence threshold outside 0.0-1.0 is rejected."""
    # 1.5 is invalid (> 1.0)
    argv = ["--all", "--confidence-threshold", "1.5", "--dry-run"]

    with self.assertRaises(CommandError) as raised:
        call_command("scan_documents_ai", *argv)

    message = str(raised.exception)
    self.assertIn("between 0.0 and 1.0", message)
|
||||||
|
|
||||||
|
def test_command_auto_apply(self):
    """--auto-apply-high-confidence hands scan results to apply_scan_results."""
    # High-confidence tag suggestion
    scan_result = AIScanResult()
    scan_result.tags = [(self.tag_important.id, 0.90)]

    scanner = mock.Mock()
    scanner.scan_document.return_value = scan_result
    # Canned result for apply_scan_results
    scanner.apply_scan_results.return_value = {
        "applied": {
            "tags": [{"id": self.tag_important.id, "name": "Important"}],
        },
        "suggestions": {},
    }

    with mock.patch(
        "documents.management.commands.scan_documents_ai.get_ai_scanner",
        return_value=scanner,
    ):
        call_command(
            "scan_documents_ai",
            "--all",
            "--auto-apply-high-confidence",
            "--no-progress-bar",
            stdout=StringIO(),
        )

    # apply_scan_results must have been invoked with auto_apply=True
    self.assertTrue(scanner.apply_scan_results.called)
    last_call = scanner.apply_scan_results.call_args
    self.assertTrue(last_call[1]["auto_apply"])
|
||||||
|
|
||||||
|
def test_command_dry_run_does_not_apply(self):
    """
    Ensure --dry-run suppresses the apply step.

    Even when --auto-apply-high-confidence is also given, the mocked
    scanner's apply_scan_results must stay untouched and the output
    must contain the "DRY RUN" marker.
    """
    scanner = mock.Mock()

    scan_result = AIScanResult()
    scan_result.tags = [(self.tag_important.id, 0.90)]
    scanner.scan_document.return_value = scan_result

    captured = StringIO()
    patch_target = "documents.management.commands.scan_documents_ai.get_ai_scanner"
    with mock.patch(patch_target, return_value=scanner):
        call_command(
            "scan_documents_ai",
            "--all",
            "--dry-run",
            "--auto-apply-high-confidence",  # expected to have no effect here
            "--no-progress-bar",
            stdout=captured,
        )

    # Nothing may be applied while in dry-run mode
    scanner.apply_scan_results.assert_not_called()

    self.assertIn("DRY RUN", captured.getvalue())
|
||||||
|
|
||||||
|
def test_command_handles_document_without_content(self):
    """
    Confirm that a document with empty content is never sent to the scanner.

    A document with ``content=""`` is created and selected via --id-range;
    the mocked scanner's scan_document must record zero calls.
    """
    empty_doc = Document.objects.create(
        title="No Content Doc",
        content="",  # intentionally blank
        mime_type="application/pdf",
        checksum="EMPTY123",
    )
    doc_id = str(empty_doc.id)

    scanner = mock.Mock()
    scanner.scan_document.return_value = AIScanResult()

    patch_target = "documents.management.commands.scan_documents_ai.get_ai_scanner"
    with mock.patch(patch_target, return_value=scanner):
        call_command(
            "scan_documents_ai",
            "--id-range",
            doc_id,
            doc_id,
            "--dry-run",
            "--no-progress-bar",
            stdout=StringIO(),
        )

    # The empty document must have been skipped before scanning
    self.assertEqual(scanner.scan_document.call_count, 0)
|
||||||
|
|
||||||
|
def test_command_handles_scanner_error(self):
    """
    Verify that an exception from the scanner is reported, not propagated.

    scan_document is made to raise; the command must still finish and
    its output must include the "Errors encountered:" section.
    """
    scanner = mock.Mock()
    # Every scan attempt blows up
    scanner.scan_document.side_effect = Exception("Scanner error")

    captured = StringIO()
    patch_target = "documents.management.commands.scan_documents_ai.get_ai_scanner"
    with mock.patch(patch_target, return_value=scanner):
        call_command(
            "scan_documents_ai",
            "--all",
            "--dry-run",
            "--no-progress-bar",
            stdout=captured,
        )

    # The failure must surface in the summary output
    self.assertIn("Errors encountered:", captured.getvalue())
|
||||||
|
|
||||||
|
def test_command_batch_processing(self):
    """
    Run the command with --batch-size 5 over at least ten documents.

    Ten extra documents are created up front; the mocked scanner must
    have been asked to scan at least that many.
    """
    extra_docs = 10
    for index in range(extra_docs):
        Document.objects.create(
            title=f"Batch Doc {index}",
            content=f"Content {index}",
            mime_type="application/pdf",
            checksum=f"BATCH{index}",
        )

    scanner = mock.Mock()
    scanner.scan_document.return_value = AIScanResult()

    patch_target = "documents.management.commands.scan_documents_ai.get_ai_scanner"
    with mock.patch(patch_target, return_value=scanner):
        call_command(
            "scan_documents_ai",
            "--all",
            "--dry-run",
            "--batch-size",
            "5",
            "--no-progress-bar",
            stdout=StringIO(),
        )

    # Every newly created document must have been scanned
    self.assertGreaterEqual(scanner.scan_document.call_count, 10)
|
||||||
|
|
||||||
|
def test_command_displays_suggestions(self):
    """
    Check that the command output lists the suggestion categories.

    The mocked scanner returns a result carrying tag, correspondent,
    document type and title suggestions for a single document; the
    output must contain the corresponding section labels.
    """
    scan_result = AIScanResult()
    scan_result.tags = [(self.tag_important.id, 0.85)]
    scan_result.correspondent = (self.correspondent.id, 0.80)
    scan_result.document_type = (self.doc_type_invoice.id, 0.90)
    scan_result.title_suggestion = "Suggested Title"

    scanner = mock.Mock()
    scanner.scan_document.return_value = scan_result

    target_id = str(self.doc1.id)
    captured = StringIO()
    patch_target = "documents.management.commands.scan_documents_ai.get_ai_scanner"
    with mock.patch(patch_target, return_value=scanner):
        call_command(
            "scan_documents_ai",
            "--id-range",
            target_id,
            target_id,
            "--dry-run",
            "--no-progress-bar",
            stdout=captured,
        )

    printed = captured.getvalue()

    # Each suggestion category must be labelled in the output
    for label in (
        "Sample Suggestions",
        "Tags:",
        "Correspondent:",
        "Document Type:",
    ):
        self.assertIn(label, printed)
|
||||||
|
|
||||||
|
@override_settings(PAPERLESS_ENABLE_AI_SCANNER=False)
def test_command_works_when_ai_disabled(self):
    """
    Run the command with PAPERLESS_ENABLE_AI_SCANNER overridden to False.

    With the scanner mocked out, the command must complete without
    raising and print its "Processing Complete" summary.
    """
    scanner = mock.Mock()
    scanner.scan_document.return_value = AIScanResult()

    captured = StringIO()
    patch_target = "documents.management.commands.scan_documents_ai.get_ai_scanner"
    with mock.patch(patch_target, return_value=scanner):
        # Any exception raised here fails the test
        call_command(
            "scan_documents_ai",
            "--all",
            "--dry-run",
            "--no-progress-bar",
            stdout=captured,
        )

    self.assertIn("Processing Complete", captured.getvalue())
|
||||||
Loading…
Add table
Add a link
Reference in a new issue