paperless-ngx/src/documents/management/commands/document_archiver.py

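"""
Management command that creates archived versions of documents.

For each selected document, the parser matching the document's MIME type
extracts its text and produces an archive file; the database record and
the search index are then updated to match. Documents are processed in
parallel in a multiprocessing pool.
"""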

import hashlib
import logging
import multiprocessing
import os
import shutil
import uuid

import tqdm
from django import db
from django.conf import settings
from django.core.management.base import BaseCommand
from django.db import transaction
from filelock import FileLock

from documents.models import Document

from ... import index
from ...file_handling import create_source_path_directory, generate_unique_filename
from ...parsers import get_parser_class_for_mime_type

logger = logging.getLogger("paperless.management.archiver")


def handle_document(document_id):
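    """
    Create (or recreate) the archived version of a single document.

    Runs inside a pool worker process: errors are logged instead of being
    raised, and the parser's temporary files are always cleaned up.
    """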
    document = Document.objects.get(id=document_id)

    mime_type = document.mime_type

    parser_class = get_parser_class_for_mime_type(mime_type)

    if not parser_class:
        logger.error(
            f"No parser found for mime type {mime_type}, cannot "
            f"archive document {document} (ID: {document_id})"
        )
        return

    parser = parser_class(logging_group=uuid.uuid4())

    try:
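        # Parse the document. This extracts its text and, for parsers that
        # support it, produces the archive version in a temporary location.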
        parser.parse(document.source_path, mime_type, document.get_public_filename())

        thumbnail = parser.get_optimised_thumbnail(
            document.source_path, mime_type, document.get_public_filename()
        )

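        # Not every parser produces an archive version; the database update
        # and the file moves below only apply when one exists.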
        if parser.get_archive_path():
            with transaction.atomic():
                with open(parser.get_archive_path(), "rb") as f:
                    checksum = hashlib.md5(f.read()).hexdigest()
                # Save the new values to the database first so that, if the
                # file move below fails, the transaction (and with it the
                # database) is rolled back.
                # Don't use save() here: that triggers the file handling
                # logic, which must not run while the file is not yet in
                # place.
                document.archive_filename = generate_unique_filename(
                    document, archive_filename=True
                )
                Document.objects.filter(pk=document.pk).update(
                    archive_checksum=checksum,
                    content=parser.get_text(),
                    archive_filename=document.archive_filename,
                )
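                # Hold the media lock while moving files into place, so
                # that no other paperless process handles them concurrently.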
                with FileLock(settings.MEDIA_LOCK):
                    create_source_path_directory(document.archive_path)
                    shutil.move(parser.get_archive_path(), document.archive_path)
                    shutil.move(thumbnail, document.thumbnail_path)

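        # Reflect the newly extracted content in the search index.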
        with index.open_index_writer() as writer:
            index.update_document(writer, document)

    except Exception:
        logger.exception(
            f"Error while parsing document {document} (ID: {document_id})"
        )
    finally:
        parser.cleanup()


class Command(BaseCommand):
help = """
Using the current classification model, assigns correspondents, tags
and document types to all documents, effectively allowing you to
back-tag all previously indexed documents with metadata created (or
modified) after their initial import.
2022-02-27 15:26:41 +01:00
""".replace(
" ", ""
)
    def add_arguments(self, parser):
        parser.add_argument(
            "-f",
            "--overwrite",
            default=False,
            action="store_true",
            help="Recreates the archived document for documents that already "
            "have an archived version.",
        )

        parser.add_argument(
            "-d",
            "--document",
            default=None,
            type=int,
            required=False,
            help="Specify the ID of a document, and this command will only "
            "run on this specific document.",
        )

        parser.add_argument(
            "--no-progress-bar",
            default=False,
            action="store_true",
            help="If set, the progress bar will not be shown.",
        )

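    # Example invocations (the command name is derived from this module's
    # file name, document_archiver.py):
    #
    #   python3 manage.py document_archiver
    #   python3 manage.py document_archiver --overwrite
    #   python3 manage.py document_archiver --document 123 --no-progress-bar
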
    def handle(self, *args, **options):

        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)

        overwrite = options["overwrite"]

        if options["document"]:
            documents = Document.objects.filter(pk=options["document"])
        else:
            documents = Document.objects.all()

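        # Unless --overwrite is given, skip documents that already have an
        # archive version.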
        document_ids = [
            document.id
            for document in documents
            if overwrite or not document.has_archive_version
        ]

        # Note to future self: this prevents django from reusing database
        # connections between processes, which is bad and does not work
        # with postgres.
        db.connections.close_all()

        try:
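            # Raise the threshold of the first root log handler to ERROR,
            # so that routine log output doesn't garble the progress bar.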
            logging.getLogger().handlers[0].level = logging.ERROR
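            # Archive the documents in parallel, using as many worker
            # processes as paperless is configured to use for tasks. The
            # list() call consumes the lazy iterator, which both drives the
            # workers and lets tqdm track completed documents.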
            with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:
                list(
                    tqdm.tqdm(
                        pool.imap_unordered(handle_document, document_ids),
                        total=len(document_ids),
                        disable=options["no_progress_bar"],
                    )
                )
        except KeyboardInterrupt:
            print("Aborting...")