Changes before error encountered

Co-authored-by: dawnsystem <42047891+dawnsystem@users.noreply.github.com>
2025-12-14 10:36:58 +01:00 · 2025-11-12 15:39:22 +00:00 · 2025-11-12 15:39:22 +00:00 · cc9e66c11c
commit cc9e66c11c
parent 275ff4d1d4
2 changed files with 1015 additions and 0 deletions
--- a/src/documents/management/commands/scan_documents_ai.py
+++ b/src/documents/management/commands/scan_documents_ai.py
@ -0,0 +1,573 @@
+"""
+Management command to apply AI scanner to existing documents.
+
+This command allows batch processing of documents through the AI scanner,
+enabling metadata suggestions for documents that were added before the
+AI scanner was implemented or to re-scan documents with updated AI models.
+"""
+
+import logging
+from datetime import datetime
+from typing import Any
+
+import tqdm
+from django.core.management.base import BaseCommand
+from django.core.management.base import CommandError
+from django.utils import timezone
+
+from documents.ai_scanner import AIScanResult
+from documents.ai_scanner import get_ai_scanner
+from documents.management.commands.mixins import ProgressBarMixin
+from documents.models import Document
+from documents.models import DocumentType
+from documents.models import Tag
+
+logger = logging.getLogger("paperless.management.scan_documents_ai")
+
+
+class Command(ProgressBarMixin, BaseCommand):
+    """
+    Management command to apply AI scanner to existing documents.
+    
+    This command processes existing documents through the comprehensive AI scanner
+    to generate metadata suggestions (tags, correspondents, document types, etc.).
+    """
+
+    help = (
+        "Apply AI scanner to existing documents to generate metadata suggestions. "
+        "Supports filtering by document type, date range, and auto-apply for high "
+        "confidence suggestions. Use --dry-run to preview suggestions without applying."
+    )
+
+    def add_arguments(self, parser):
+        """
+        Register every option the command understands on ``parser``.
+
+        Options are declared in three groups: document selection filters,
+        processing behaviour, and progress/batching controls.
+        """
+        register = parser.add_argument
+
+        # --- Document selection ---
+        register(
+            "--all",
+            default=False,
+            action="store_true",
+            help="Scan all documents in the system",
+        )
+        register(
+            "--filter-by-type",
+            metavar="TYPE_ID",
+            nargs="+",
+            type=int,
+            help="Filter documents by document type ID(s). Can specify multiple IDs.",
+        )
+        date_range_help = (
+            "Filter documents by creation date range. "
+            "Format: YYYY-MM-DD YYYY-MM-DD. Example: 2024-01-01 2024-12-31"
+        )
+        register(
+            "--date-range",
+            metavar=("START_DATE", "END_DATE"),
+            nargs=2,
+            help=date_range_help,
+        )
+        register(
+            "--id-range",
+            metavar=("START_ID", "END_ID"),
+            type=int,
+            nargs=2,
+            help="Filter documents by ID range. Example: 1 100",
+        )
+
+        # --- Processing behaviour ---
+        register(
+            "--dry-run",
+            default=False,
+            action="store_true",
+            help="Preview suggestions without applying any changes",
+        )
+        auto_apply_help = (
+            "Automatically apply suggestions with high confidence (>=80%%). "
+            "Lower confidence suggestions will still be shown for review."
+        )
+        register(
+            "--auto-apply-high-confidence",
+            default=False,
+            action="store_true",
+            help=auto_apply_help,
+        )
+        threshold_help = (
+            "Minimum confidence threshold for showing suggestions (0.0-1.0). "
+            "Default: 0.60 (60%%)"
+        )
+        register(
+            "--confidence-threshold",
+            default=0.60,
+            type=float,
+            help=threshold_help,
+        )
+
+        # --- Progress reporting and batching ---
+        # The progress-bar option(s) come from ProgressBarMixin.
+        self.add_argument_progress_bar_mixin(parser)
+        register(
+            "--batch-size",
+            default=100,
+            type=int,
+            help="Number of documents to process in memory at once. Default: 100",
+        )
+
+    def handle(self, *args, **options):
+        """
+        Execute the command.
+
+        Validates the options, builds the document queryset, initialises the
+        AI scanner, processes the documents and prints a summary.
+
+        Raises:
+            CommandError: If the options are invalid or the AI scanner
+                cannot be initialised.
+        """
+        # Let the progress-bar mixin consume its own options first.
+        self.handle_progress_bar_mixin(**options)
+
+        # Validate arguments (may also store parsed dates in ``options``)
+        self._validate_arguments(options)
+
+        # Get queryset based on filters
+        queryset = self._build_queryset(options)
+        document_count = queryset.count()
+
+        if document_count == 0:
+            self.stdout.write(
+                self.style.WARNING("No documents found matching the specified filters."),
+            )
+            return
+
+        # Initialize AI scanner. Any failure is reported as a CommandError;
+        # chaining with ``from e`` keeps the original traceback available.
+        try:
+            scanner = get_ai_scanner()
+        except Exception as e:
+            raise CommandError(f"Failed to initialize AI scanner: {e}") from e
+
+        # Display operation summary
+        self._display_operation_summary(options, document_count)
+
+        # Process documents
+        results = self._process_documents(
+            queryset=queryset,
+            scanner=scanner,
+            options=options,
+        )
+
+        # Display final summary
+        self._display_final_summary(results, options)
+
+    def _validate_arguments(self, options):
+        """
+        Validate command line arguments.
+
+        When ``--date-range`` is given, the parsed, timezone-aware bounds are
+        stored in ``options`` under ``_parsed_start_date`` and
+        ``_parsed_end_date`` for later use when building the queryset.
+
+        Args:
+            options: Parsed command options (mutated as described above).
+
+        Raises:
+            CommandError: If no filter is given, or the confidence threshold,
+                batch size, ID range, date range or a document type ID is
+                invalid.
+        """
+        # At least one filter must be specified
+        if not any([
+            options["all"],
+            options["filter_by_type"],
+            options["date_range"],
+            options["id_range"],
+        ]):
+            raise CommandError(
+                "You must specify at least one filter: "
+                "--all, --filter-by-type, --date-range, or --id-range",
+            )
+
+        # Validate confidence threshold
+        if not 0.0 <= options["confidence_threshold"] <= 1.0:
+            raise CommandError("Confidence threshold must be between 0.0 and 1.0")
+
+        # Validate batch size: it is used as a range() step, so zero would
+        # crash and a negative value would silently process nothing.
+        if options["batch_size"] < 1:
+            raise CommandError("Batch size must be a positive integer")
+
+        # Validate ID range ordering (mirrors the date range check below)
+        if options["id_range"]:
+            start_id, end_id = options["id_range"]
+            if start_id > end_id:
+                raise CommandError("Start ID must not be greater than end ID")
+
+        # Validate date range format
+        if options["date_range"]:
+            start_str, end_str = options["date_range"]
+            try:
+                start_date = datetime.strptime(start_str, "%Y-%m-%d")
+                end_date = datetime.strptime(end_str, "%Y-%m-%d")
+            except ValueError as e:
+                raise CommandError(
+                    f"Invalid date format. Use YYYY-MM-DD. Error: {e}",
+                ) from e
+
+            if start_date > end_date:
+                raise CommandError("Start date must be before end date")
+
+            # Store parsed dates for later use. The end bound is pushed to the
+            # very last microsecond of the day so the whole end date is
+            # included by the ``<=`` comparison.
+            options["_parsed_start_date"] = timezone.make_aware(start_date)
+            options["_parsed_end_date"] = timezone.make_aware(
+                end_date.replace(hour=23, minute=59, second=59, microsecond=999999),
+            )
+
+        # Validate document types exist (single query for all requested IDs)
+        if options["filter_by_type"]:
+            requested_ids = options["filter_by_type"]
+            existing_ids = set(
+                DocumentType.objects.filter(pk__in=requested_ids).values_list(
+                    "pk",
+                    flat=True,
+                ),
+            )
+            # Report the first missing ID in the order the user supplied them.
+            for type_id in requested_ids:
+                if type_id not in existing_ids:
+                    raise CommandError(
+                        f"Document type with ID {type_id} does not exist",
+                    )
+
+    def _build_queryset(self, options):
+        """
+        Translate the selected filter options into a Document queryset.
+
+        Every active filter narrows the result; the queryset is ordered by ID
+        so documents are visited in a stable order.
+        """
+        lookups = {}
+
+        # Document type filter
+        type_ids = options["filter_by_type"]
+        if type_ids:
+            lookups["document_type__id__in"] = type_ids
+
+        # Creation date filter (bounds were parsed during validation)
+        if options["date_range"]:
+            lookups["created__gte"] = options["_parsed_start_date"]
+            lookups["created__lte"] = options["_parsed_end_date"]
+
+        # ID range filter (inclusive on both ends)
+        id_bounds = options["id_range"]
+        if id_bounds:
+            lowest_id, highest_id = id_bounds
+            lookups["id__gte"] = lowest_id
+            lookups["id__lte"] = highest_id
+
+        return Document.objects.filter(**lookups).order_by("id")
+
+    def _display_operation_summary(self, options, document_count):
+        """
+        Print a banner describing what is about to happen: the active
+        filters, the processing mode, the confidence threshold and the
+        number of documents selected.
+        """
+        emit = self.stdout.write
+        highlight = self.style.SUCCESS
+        rule = "=" * 70
+
+        emit(highlight("\n" + rule))
+        emit(highlight("AI Document Scanner - Batch Processing"))
+        emit(highlight(rule + "\n"))
+
+        # Active filters
+        emit("Filters applied:")
+        if options["all"]:
+            emit("  • Processing ALL documents")
+        if options["filter_by_type"]:
+            joined_ids = ", ".join(map(str, options["filter_by_type"]))
+            emit(f"  • Document types: {joined_ids}")
+        if options["date_range"]:
+            first_day, last_day = options["date_range"]
+            emit(f"  • Date range: {first_day} to {last_day}")
+        if options["id_range"]:
+            first_id, last_id = options["id_range"]
+            emit(f"  • ID range: {first_id} to {last_id}")
+
+        # Processing mode: dry run takes precedence over auto-apply
+        emit("\nProcessing mode:")
+        if options["dry_run"]:
+            mode_line = self.style.WARNING("  • DRY RUN - No changes will be applied")
+        elif options["auto_apply_high_confidence"]:
+            mode_line = "  • Auto-apply high confidence suggestions (≥80%)"
+        else:
+            mode_line = "  • Preview mode - No changes will be applied"
+        emit(mode_line)
+
+        threshold = options["confidence_threshold"]
+        emit(f"  • Confidence threshold: {threshold:.0%}")
+
+        # Number of selected documents
+        count_label = highlight("Documents to process:")
+        emit(f"\n{count_label} {document_count}")
+        emit("\n" + rule + "\n")
+
+    def _process_documents(
+        self,
+        queryset,
+        scanner,
+        options,
+    ) -> dict[str, Any]:
+        """
+        Process documents through the AI scanner.
+
+        The IDs of all matching documents are captured once, before any
+        document is touched, and then worked through in slices of
+        ``batch_size``. Slicing the live queryset by offset instead would skip
+        documents whenever auto-apply changes a field the queryset filters on
+        (for example the document type), because the remaining rows shift.
+
+        Documents without text content are logged and skipped; they are not
+        counted as processed. An exception while handling one document is
+        logged and recorded, and processing continues with the next one.
+
+        Args:
+            queryset: Document queryset selecting what to process.
+            scanner: Object providing ``scan_document`` and
+                ``apply_scan_results``.
+            options: Parsed command options; reads ``batch_size``,
+                ``confidence_threshold``, ``auto_apply_high_confidence`` and
+                ``dry_run``.
+
+        Returns:
+            Dictionary with processing results and statistics
+        """
+        results = {
+            "processed": 0,
+            "errors": 0,
+            "suggestions_generated": 0,
+            "auto_applied": 0,
+            "documents_with_suggestions": [],
+            "error_documents": [],
+        }
+
+        # Guard against a zero/negative step, which range() cannot handle.
+        batch_size = max(1, options["batch_size"])
+        confidence_threshold = options["confidence_threshold"]
+        # A dry run always wins over auto-apply.
+        auto_apply = options["auto_apply_high_confidence"] and not options["dry_run"]
+
+        # Stable snapshot of the work list (queryset order is preserved).
+        document_ids = list(queryset.values_list("pk", flat=True))
+
+        for offset in tqdm.tqdm(
+            range(0, len(document_ids), batch_size),
+            disable=self.no_progress_bar,
+            desc="Processing batches",
+        ):
+            batch_ids = document_ids[offset:offset + batch_size]
+            batch = queryset.filter(pk__in=batch_ids)
+
+            for document in batch:
+                try:
+                    # Get document text
+                    document_text = document.content or ""
+
+                    if not document_text:
+                        logger.warning(
+                            "Document %s has no text content, skipping",
+                            document.id,
+                        )
+                        continue
+
+                    # Scan document
+                    scan_result = scanner.scan_document(
+                        document=document,
+                        document_text=document_text,
+                    )
+
+                    # Filter results by confidence threshold
+                    filtered_result = self._filter_by_confidence(
+                        scan_result,
+                        confidence_threshold,
+                    )
+
+                    # Count suggestions
+                    suggestion_count = self._count_suggestions(filtered_result)
+
+                    if suggestion_count > 0:
+                        results["suggestions_generated"] += suggestion_count
+
+                        # Apply suggestions when requested
+                        applied = None
+                        if auto_apply:
+                            applied = scanner.apply_scan_results(
+                                document=document,
+                                scan_result=filtered_result,
+                                auto_apply=True,
+                            )
+                            results["auto_applied"] += self._count_applied(applied)
+
+                        # Store for summary
+                        results["documents_with_suggestions"].append({
+                            "id": document.id,
+                            "title": document.title,
+                            "suggestions": filtered_result.to_dict(),
+                            "applied": applied,
+                        })
+
+                    results["processed"] += 1
+
+                except Exception as e:
+                    # Best effort: one failing document must not abort the run.
+                    logger.exception(
+                        "Error processing document %s: %s",
+                        document.id,
+                        e,
+                    )
+                    results["errors"] += 1
+                    results["error_documents"].append({
+                        "id": document.id,
+                        "title": document.title,
+                        "error": str(e),
+                    })
+
+        return results
+
+    @staticmethod
+    def _count_applied(applied_result) -> int:
+        """
+        Count the changes reported under ``applied_result["applied"]``.
+
+        List-like entries contribute one per element; any other non-empty
+        entry counts as a single change.
+        NOTE(review): the shape is inferred from how the summary display reads
+        this structure (tags as a list, other categories as one object each)
+        - confirm against the scanner's ``apply_scan_results``.
+        """
+        total = 0
+        for value in (applied_result or {}).get("applied", {}).values():
+            if isinstance(value, (list, tuple, set)):
+                total += len(value)
+            elif value:
+                total += 1
+        return total
+
+    def _filter_by_confidence(
+        self,
+        scan_result: AIScanResult,
+        threshold: float,
+    ) -> AIScanResult:
+        """
+        Build a new scan result holding only suggestions whose confidence is
+        at least ``threshold``.
+
+        Extracted entities, the title suggestion and metadata carry no
+        confidence here and are copied over unchanged.
+        """
+
+        def confident_pairs(pairs):
+            """Keep the (id, confidence) pairs that reach the threshold."""
+            return [
+                (item_id, score) for item_id, score in pairs
+                if score >= threshold
+            ]
+
+        kept = AIScanResult()
+
+        # Multi-valued suggestion: tags
+        kept.tags = confident_pairs(scan_result.tags)
+
+        # Single-valued suggestions are stored as one (id, confidence) pair
+        for attribute in ("correspondent", "document_type", "storage_path"):
+            candidate = getattr(scan_result, attribute)
+            if not candidate:
+                continue
+            _item_id, score = candidate
+            if score >= threshold:
+                setattr(kept, attribute, candidate)
+
+        # Custom fields map field id -> (value, confidence)
+        for field_id, value_and_score in scan_result.custom_fields.items():
+            value, score = value_and_score
+            if score >= threshold:
+                kept.custom_fields[field_id] = (value, score)
+
+        # Multi-valued suggestion: workflows
+        kept.workflows = confident_pairs(scan_result.workflows)
+
+        # Fields that are passed through untouched
+        kept.extracted_entities = scan_result.extracted_entities
+        kept.title_suggestion = scan_result.title_suggestion
+        kept.metadata = scan_result.metadata
+
+        return kept
+
+    def _count_suggestions(self, scan_result: AIScanResult) -> int:
+        """
+        Return how many individual suggestions ``scan_result`` contains.
+
+        Collections count once per entry; each single-valued suggestion
+        (including the title) counts once when it is set.
+        """
+        collection_total = (
+            len(scan_result.tags)
+            + len(scan_result.custom_fields)
+            + len(scan_result.workflows)
+        )
+        single_valued = (
+            scan_result.correspondent,
+            scan_result.document_type,
+            scan_result.storage_path,
+            scan_result.title_suggestion,
+        )
+        return collection_total + sum(1 for value in single_valued if value)
+
+    def _display_final_summary(self, results: dict[str, Any], options):
+        """
+        Print the closing report: counters, suggestions for the first five
+        documents, up to ten errors and a closing message that depends on the
+        processing mode.
+        """
+        emit = self.stdout.write
+        style = self.style
+        heavy_rule = "=" * 70
+        light_rule = "-" * 70
+
+        dry_run = options["dry_run"]
+        auto_apply_requested = options["auto_apply_high_confidence"]
+        with_suggestions = results["documents_with_suggestions"]
+        failed_documents = results["error_documents"]
+
+        emit("\n" + heavy_rule)
+        emit(style.SUCCESS("Processing Complete - Summary"))
+        emit(heavy_rule + "\n")
+
+        # Counters
+        emit("Statistics:")
+        emit(f"  • Documents processed: {results['processed']}")
+        emit(f"  • Documents with suggestions: {len(with_suggestions)}")
+        emit(f"  • Total suggestions generated: {results['suggestions_generated']}")
+        if auto_apply_requested and not dry_run:
+            emit(
+                style.SUCCESS(f"  • Suggestions auto-applied: {results['auto_applied']}"),
+            )
+        if results["errors"] > 0:
+            emit(style.ERROR(f"  • Errors encountered: {results['errors']}"))
+
+        # Sample of suggestions
+        if with_suggestions:
+            emit("\n" + light_rule)
+            emit("Sample Suggestions (first 5 documents):\n")
+            for sample in with_suggestions[:5]:
+                self._display_document_suggestions(sample, options)
+
+        # Recorded errors
+        if failed_documents:
+            emit("\n" + light_rule)
+            emit(style.ERROR("Errors:\n"))
+            for failure in failed_documents[:10]:
+                emit(f"  • Document {failure['id']}: {failure['title']}")
+                emit(f"    Error: {failure['error']}")
+
+        # Closing message: dry run takes precedence over auto-apply
+        if dry_run:
+            closing = style.WARNING(
+                "DRY RUN completed - No changes were applied to documents.",
+            )
+        elif auto_apply_requested:
+            closing = style.SUCCESS(
+                f"Processing complete - {results['auto_applied']} high confidence "
+                "suggestions were automatically applied.",
+            )
+        else:
+            closing = style.SUCCESS(
+                "Processing complete - Suggestions generated. Use "
+                "--auto-apply-high-confidence to apply them automatically.",
+            )
+        emit("\n" + heavy_rule)
+        emit(closing)
+        emit(heavy_rule + "\n")
+
+    def _display_document_suggestions(self, doc_info: dict[str, Any], options):
+        """
+        Display suggestions for a single document.
+
+        Prints up to three suggested tags, the suggested correspondent,
+        document type, storage path and title, followed by any changes that
+        were applied. A suggestion whose target object can no longer be found
+        is shown with a placeholder instead of being dropped silently, so the
+        output stays consistent with the suggestion counts.
+
+        Args:
+            doc_info: Entry with ``id``, ``title``, ``suggestions`` and
+                optionally ``applied``.
+            options: Parsed command options (currently unused).
+        """
+        from documents.models import Correspondent
+        from documents.models import StoragePath
+
+        def resolve_name(model, object_id, kind):
+            """Return the object's name, or a placeholder when it is missing."""
+            found = (
+                model.objects.filter(pk=object_id)
+                .values_list("name", flat=True)
+                .first()
+            )
+            return found if found is not None else f"<unknown {kind} #{object_id}>"
+
+        self.stdout.write(
+            f"\n  Document #{doc_info['id']}: {doc_info['title']}",
+        )
+
+        suggestions = doc_info["suggestions"]
+
+        # Tags
+        if suggestions.get("tags"):
+            self.stdout.write("    Tags:")
+            for tag_id, conf in suggestions["tags"][:3]:  # Show first 3
+                tag_name = resolve_name(Tag, tag_id, "tag")
+                self.stdout.write(
+                    f"      • {tag_name} (confidence: {conf:.0%})",
+                )
+
+        # Single-valued suggestions, each stored as an (id, confidence) pair
+        single_valued = (
+            ("correspondent", "Correspondent", Correspondent),
+            ("document_type", "Document Type", DocumentType),
+            ("storage_path", "Storage Path", StoragePath),
+        )
+        for key, heading, model in single_valued:
+            if suggestions.get(key):
+                object_id, conf = suggestions[key]
+                object_name = resolve_name(model, object_id, heading.lower())
+                self.stdout.write(
+                    f"    {heading}: {object_name} (confidence: {conf:.0%})",
+                )
+
+        # Title suggestion
+        if suggestions.get("title_suggestion"):
+            self.stdout.write(
+                f"    Title: {suggestions['title_suggestion']}",
+            )
+
+        # Applied changes (if auto-apply was enabled)
+        if doc_info.get("applied"):
+            applied = doc_info["applied"].get("applied", {})
+            if any(applied.values()):
+                self.stdout.write(
+                    self.style.SUCCESS("    ✓ Applied changes:"),
+                )
+                if applied.get("tags"):
+                    tag_names = [t["name"] for t in applied["tags"]]
+                    self.stdout.write(
+                        f"      • Tags: {', '.join(tag_names)}",
+                    )
+                if applied.get("correspondent"):
+                    self.stdout.write(
+                        f"      • Correspondent: {applied['correspondent']['name']}",
+                    )
+                if applied.get("document_type"):
+                    self.stdout.write(
+                        f"      • Type: {applied['document_type']['name']}",
+                    )
+                if applied.get("storage_path"):
+                    self.stdout.write(
+                        f"      • Path: {applied['storage_path']['name']}",
+                    )
--- a/src/documents/tests/test_management_scan_ai.py
+++ b/src/documents/tests/test_management_scan_ai.py
@ -0,0 +1,442 @@
+"""
+Tests for the scan_documents_ai management command.
+"""
+
+from io import StringIO
+from unittest import mock
+
+from django.core.management import CommandError
+from django.core.management import call_command
+from django.test import TestCase
+from django.test import override_settings
+from django.utils import timezone
+
+from documents.ai_scanner import AIScanResult
+from documents.models import Correspondent
+from documents.models import Document
+from documents.models import DocumentType
+from documents.models import Tag
+from documents.tests.utils import DirectoriesMixin
+
+
+class TestScanDocumentsAICommand(DirectoriesMixin, TestCase):
+    """Test cases for the scan_documents_ai management command."""
+
+    def setUp(self):
+        """
+        Set up test data: two document types, two tags, one correspondent and
+        three documents (one typed as a receipt, one created a year ago).
+        """
+        # ``timedelta`` is imported from the standard library rather than
+        # reached through ``django.utils.timezone``, which only exposes it as
+        # an undocumented side effect of its own imports.
+        from datetime import timedelta
+
+        super().setUp()
+
+        # Create test document types
+        self.doc_type_invoice = DocumentType.objects.create(name="Invoice")
+        self.doc_type_receipt = DocumentType.objects.create(name="Receipt")
+
+        # Create test tags
+        self.tag_important = Tag.objects.create(name="Important")
+        self.tag_tax = Tag.objects.create(name="Tax")
+
+        # Create test correspondent
+        self.correspondent = Correspondent.objects.create(name="Test Company")
+
+        # Create test documents
+        self.doc1 = Document.objects.create(
+            title="Test Document 1",
+            content="This is a test invoice document with important information.",
+            mime_type="application/pdf",
+            checksum="ABC123",
+        )
+
+        self.doc2 = Document.objects.create(
+            title="Test Document 2",
+            content="This is another test receipt document.",
+            mime_type="application/pdf",
+            checksum="DEF456",
+            document_type=self.doc_type_receipt,
+        )
+
+        # Created one year ago so date-range tests can exclude it
+        self.doc3 = Document.objects.create(
+            title="Test Document 3",
+            content="A third document for testing date ranges.",
+            mime_type="application/pdf",
+            checksum="GHI789",
+            created=timezone.now() - timedelta(days=365),
+        )
+
+    def test_command_requires_filter(self):
+        """Running the command without any filter option is rejected."""
+        with self.assertRaisesMessage(CommandError, "at least one filter"):
+            call_command("scan_documents_ai")
+
+    def test_command_all_flag(self):
+        """A dry run with --all completes and prints the final summary."""
+        # Scanner stub returning one high-confidence tag for every document
+        tagged_result = AIScanResult()
+        tagged_result.tags = [(self.tag_important.id, 0.85)]
+        scanner = mock.Mock()
+        scanner.scan_document.return_value = tagged_result
+
+        buffer = StringIO()
+        with mock.patch(
+            "documents.management.commands.scan_documents_ai.get_ai_scanner",
+            return_value=scanner,
+        ):
+            call_command(
+                "scan_documents_ai",
+                "--all",
+                "--dry-run",
+                "--no-progress-bar",
+                stdout=buffer,
+            )
+
+        printed = buffer.getvalue()
+        for expected in ("Processing Complete", "Documents processed:"):
+            self.assertIn(expected, printed)
+
+    def test_command_filter_by_type(self):
+        """--filter-by-type restricts scanning to documents of that type."""
+        scanner = mock.Mock()
+        scanner.scan_document.return_value = AIScanResult()
+
+        buffer = StringIO()
+        with mock.patch(
+            "documents.management.commands.scan_documents_ai.get_ai_scanner",
+            return_value=scanner,
+        ):
+            call_command(
+                "scan_documents_ai",
+                "--filter-by-type",
+                str(self.doc_type_receipt.id),
+                "--dry-run",
+                "--no-progress-bar",
+                stdout=buffer,
+            )
+
+        # Only doc2 carries the receipt type
+        self.assertEqual(scanner.scan_document.call_count, 1)
+
+    def test_command_invalid_document_type(self):
+        """An unknown document type ID is rejected with a clear message."""
+        arguments = ["--filter-by-type", "99999", "--dry-run"]
+        with self.assertRaisesMessage(CommandError, "does not exist"):
+            call_command("scan_documents_ai", *arguments)
+
+    def test_command_date_range(self):
+        """
+        Test command with --date-range option.
+
+        The range covers yesterday and today, so the two documents created
+        during setUp are scanned and the one dated a year ago is not.
+        """
+        from datetime import timedelta
+
+        with mock.patch("documents.management.commands.scan_documents_ai.get_ai_scanner") as mock_scanner:
+            mock_instance = mock.Mock()
+            mock_scanner.return_value = mock_instance
+
+            mock_result = AIScanResult()
+            mock_instance.scan_document.return_value = mock_result
+
+            # The command interprets the given dates in the current time zone,
+            # so use the local date rather than the UTC date of "now".
+            today = timezone.localdate()
+            yesterday = today - timedelta(days=1)
+
+            out = StringIO()
+            call_command(
+                "scan_documents_ai",
+                "--date-range",
+                str(yesterday),
+                str(today),
+                "--dry-run",
+                "--no-progress-bar",
+                stdout=out,
+            )
+
+            # Should scan doc1 and doc2 (recent), not doc3 (old). An exact
+            # count is required: ">= 2" would also pass if doc3 slipped in.
+            self.assertEqual(mock_instance.scan_document.call_count, 2)
+
+    def test_command_invalid_date_range(self):
+        """A date range whose end precedes its start is rejected."""
+        reversed_range = ("2024-12-31", "2024-01-01")  # End before start
+        with self.assertRaisesMessage(
+            CommandError,
+            "Start date must be before end date",
+        ):
+            call_command(
+                "scan_documents_ai",
+                "--date-range",
+                *reversed_range,
+                "--dry-run",
+            )
+
+    def test_command_invalid_date_format(self):
+        """Dates that are not in YYYY-MM-DD form are rejected."""
+        wrongly_formatted = ("01/01/2024", "12/31/2024")  # Wrong format
+        with self.assertRaisesMessage(CommandError, "Invalid date format"):
+            call_command(
+                "scan_documents_ai",
+                "--date-range",
+                *wrongly_formatted,
+                "--dry-run",
+            )
+
+    def test_command_id_range(self):
+        """--id-range limits scanning to documents inside the ID bounds."""
+        scanner = mock.Mock()
+        scanner.scan_document.return_value = AIScanResult()
+        only_id = str(self.doc1.id)
+
+        buffer = StringIO()
+        with mock.patch(
+            "documents.management.commands.scan_documents_ai.get_ai_scanner",
+            return_value=scanner,
+        ):
+            call_command(
+                "scan_documents_ai",
+                "--id-range",
+                only_id,
+                only_id,
+                "--dry-run",
+                "--no-progress-bar",
+                stdout=buffer,
+            )
+
+        # The range contains doc1 and nothing else
+        self.assertEqual(scanner.scan_document.call_count, 1)
+
+    def test_command_confidence_threshold(self):
+        """
+        Test command with custom confidence threshold.
+
+        A 0.50-confidence tag is below the default threshold but above the
+        0.40 passed here, so it must be reported as a suggestion.
+        """
+        with mock.patch("documents.management.commands.scan_documents_ai.get_ai_scanner") as mock_scanner:
+            mock_instance = mock.Mock()
+            mock_scanner.return_value = mock_instance
+
+            # Create mock result with low confidence
+            mock_result = AIScanResult()
+            mock_result.tags = [(self.tag_important.id, 0.50)]  # Low confidence
+            mock_instance.scan_document.return_value = mock_result
+
+            out = StringIO()
+            call_command(
+                "scan_documents_ai",
+                "--all",
+                "--dry-run",
+                "--confidence-threshold",
+                "0.40",  # Lower threshold
+                "--no-progress-bar",
+                stdout=out,
+            )
+
+            output = out.getvalue()
+            # The summary line is printed on every run, so its mere presence
+            # proves nothing; the total must also be non-zero, which only
+            # happens if the low-confidence tag passed the lowered threshold.
+            self.assertIn("Total suggestions generated:", output)
+            self.assertNotIn("Total suggestions generated: 0", output)
+
+    def test_command_invalid_confidence_threshold(self):
+        """A confidence threshold outside 0.0-1.0 is rejected."""
+        too_high = "1.5"  # Invalid (> 1.0)
+        with self.assertRaisesMessage(CommandError, "between 0.0 and 1.0"):
+            call_command(
+                "scan_documents_ai",
+                "--all",
+                "--confidence-threshold",
+                too_high,
+                "--dry-run",
+            )
+
+    def test_command_auto_apply(self):
+        """--auto-apply-high-confidence hands results to the scanner to apply."""
+        # Scanner stub: one high-confidence tag, and a canned apply response
+        confident_result = AIScanResult()
+        confident_result.tags = [(self.tag_important.id, 0.90)]
+        scanner = mock.Mock()
+        scanner.scan_document.return_value = confident_result
+        scanner.apply_scan_results.return_value = {
+            "applied": {
+                "tags": [{"id": self.tag_important.id, "name": "Important"}],
+            },
+            "suggestions": {},
+        }
+
+        buffer = StringIO()
+        with mock.patch(
+            "documents.management.commands.scan_documents_ai.get_ai_scanner",
+            return_value=scanner,
+        ):
+            call_command(
+                "scan_documents_ai",
+                "--all",
+                "--auto-apply-high-confidence",
+                "--no-progress-bar",
+                stdout=buffer,
+            )
+
+        # The scanner must have been asked to apply, with auto_apply=True
+        self.assertTrue(scanner.apply_scan_results.called)
+        last_call_kwargs = scanner.apply_scan_results.call_args.kwargs
+        self.assertTrue(last_call_kwargs["auto_apply"])
+
+    def test_command_dry_run_does_not_apply(self):
+        """Test that dry run mode does not apply changes."""
+        with mock.patch("documents.management.commands.scan_documents_ai.get_ai_scanner") as mock_scanner:
+            mock_instance = mock.Mock()
+            mock_scanner.return_value = mock_instance
+
+            mock_result = AIScanResult()
+            mock_result.tags = [(self.tag_important.id, 0.90)]
+            mock_instance.scan_document.return_value = mock_result
+
+            out = StringIO()
+            call_command(
+                "scan_documents_ai",
+                "--all",
+                "--dry-run",
+                "--auto-apply-high-confidence",  # Should be ignored
+                "--no-progress-bar",
+                stdout=out,
+            )
+
+            # Should not call apply_scan_results in dry-run mode
+            self.assertFalse(mock_instance.apply_scan_results.called)
+
+            output = out.getvalue()
+            self.assertIn("DRY RUN", output)
+
+    def test_command_handles_document_without_content(self):
+        """Test that command handles documents without content gracefully."""
+        # Create document without content
+        doc_no_content = Document.objects.create(
+            title="No Content Doc",
+            content="",  # Empty content
+            mime_type="application/pdf",
+            checksum="EMPTY123",
+        )
+
+        with mock.patch("documents.management.commands.scan_documents_ai.get_ai_scanner") as mock_scanner:
+            mock_instance = mock.Mock()
+            mock_scanner.return_value = mock_instance
+
+            mock_result = AIScanResult()
+            mock_instance.scan_document.return_value = mock_result
+
+            out = StringIO()
+            call_command(
+                "scan_documents_ai",
+                "--id-range",
+                str(doc_no_content.id),
+                str(doc_no_content.id),
+                "--dry-run",
+                "--no-progress-bar",
+                stdout=out,
+            )
+
+            # Should not call scan_document for empty content
+            self.assertEqual(mock_instance.scan_document.call_count, 0)
+
+    def test_command_handles_scanner_error(self):
+        """Test that command handles scanner errors gracefully."""
+        with mock.patch("documents.management.commands.scan_documents_ai.get_ai_scanner") as mock_scanner:
+            mock_instance = mock.Mock()
+            mock_scanner.return_value = mock_instance
+
+            # Make scan_document raise an exception
+            mock_instance.scan_document.side_effect = Exception("Scanner error")
+
+            out = StringIO()
+            call_command(
+                "scan_documents_ai",
+                "--all",
+                "--dry-run",
+                "--no-progress-bar",
+                stdout=out,
+            )
+
+            output = out.getvalue()
+            # Should report errors
+            self.assertIn("Errors encountered:", output)
+
+    def test_command_batch_processing(self):
+        """Test that command processes documents in batches."""
+        # Create more documents
+        for i in range(10):
+            Document.objects.create(
+                title=f"Batch Doc {i}",
+                content=f"Content {i}",
+                mime_type="application/pdf",
+                checksum=f"BATCH{i}",
+            )
+
+        with mock.patch("documents.management.commands.scan_documents_ai.get_ai_scanner") as mock_scanner:
+            mock_instance = mock.Mock()
+            mock_scanner.return_value = mock_instance
+
+            mock_result = AIScanResult()
+            mock_instance.scan_document.return_value = mock_result
+
+            out = StringIO()
+            call_command(
+                "scan_documents_ai",
+                "--all",
+                "--dry-run",
+                "--batch-size",
+                "5",
+                "--no-progress-bar",
+                stdout=out,
+            )
+
+            # Should process all documents
+            self.assertGreaterEqual(mock_instance.scan_document.call_count, 10)
+
+    def test_command_displays_suggestions(self):
+        """Test that command displays suggestions in output."""
+        with mock.patch("documents.management.commands.scan_documents_ai.get_ai_scanner") as mock_scanner:
+            mock_instance = mock.Mock()
+            mock_scanner.return_value = mock_instance
+
+            # Create comprehensive scan result
+            mock_result = AIScanResult()
+            mock_result.tags = [(self.tag_important.id, 0.85)]
+            mock_result.correspondent = (self.correspondent.id, 0.80)
+            mock_result.document_type = (self.doc_type_invoice.id, 0.90)
+            mock_result.title_suggestion = "Suggested Title"
+            mock_instance.scan_document.return_value = mock_result
+
+            out = StringIO()
+            call_command(
+                "scan_documents_ai",
+                "--id-range",
+                str(self.doc1.id),
+                str(self.doc1.id),
+                "--dry-run",
+                "--no-progress-bar",
+                stdout=out,
+            )
+
+            output = out.getvalue()
+            # Should display various suggestion types
+            self.assertIn("Sample Suggestions", output)
+            self.assertIn("Tags:", output)
+            self.assertIn("Correspondent:", output)
+            self.assertIn("Document Type:", output)
+
+    @override_settings(PAPERLESS_ENABLE_AI_SCANNER=False)
+    def test_command_works_when_ai_disabled(self):
+        """Test that command can run even if AI scanner is disabled in settings."""
+        with mock.patch("documents.management.commands.scan_documents_ai.get_ai_scanner") as mock_scanner:
+            mock_instance = mock.Mock()
+            mock_scanner.return_value = mock_instance
+
+            mock_result = AIScanResult()
+            mock_instance.scan_document.return_value = mock_result
+
+            out = StringIO()
+            # Should not raise an error
+            call_command(
+                "scan_documents_ai",
+                "--all",
+                "--dry-run",
+                "--no-progress-bar",
+                stdout=out,
+            )
+
+            output = out.getvalue()
+            self.assertIn("Processing Complete", output)