paperless-ngx/src/documents/management/commands/document_archiver.py

import hashlib
import multiprocessing

import logging
import os
import shutil
import uuid

import tqdm
from django import db
from django.conf import settings
from django.core.management.base import BaseCommand
from django.db import transaction
from filelock import FileLock
from whoosh.writing import AsyncWriter

from documents.models import Document
from ... import index
from ...file_handling import create_source_path_directory
from ...parsers import get_parser_class_for_mime_type


logger = logging.getLogger("paperless.management.archiver")


def handle_document(document_id):
    """
    Re-parse one document and store the archive version the parser produces.

    The document is loaded by id and parsed with the parser class selected
    for its mime type. If the parser yields an archive file, its MD5
    checksum and the parsed text are written to the database and the file is
    moved to ``document.archive_path``. Afterwards the search index entry of
    the document is updated.

    Errors raised while parsing, storing or indexing are logged and
    swallowed, so that one failing document does not stop the processing of
    the remaining ones.

    :param document_id: primary key of the document to process.
    """
    document = Document.objects.get(id=document_id)

    mime_type = document.mime_type

    parser_class = get_parser_class_for_mime_type(mime_type)

    # Guard against the lookup yielding no parser class. Calling a falsy
    # result would raise outside of the try block below and propagate to
    # the caller instead of being logged like every other per-document error.
    if not parser_class:
        logger.error(
            f"No parser found for mime type {mime_type}, cannot archive "
            f"document {document} (ID: {document_id})"
        )
        return

    parser = parser_class(logging_group=uuid.uuid4())

    try:
        parser.parse(document.source_path, mime_type)

        # Resolve the archive file once instead of asking the parser again
        # for every use.
        archive_file = parser.get_archive_path()

        if archive_file:
            with transaction.atomic():
                with open(archive_file, 'rb') as f:
                    checksum = hashlib.md5(f.read()).hexdigest()
                text = parser.get_text()
                # i'm going to save first so that in case the file move
                # fails, the database is rolled back.
                # we also don't use save() since that triggers the filehandling
                # logic, and we don't want that yet (file not yet in place)
                Document.objects.filter(pk=document.pk).update(
                    archive_checksum=checksum,
                    content=text
                )
                with FileLock(settings.MEDIA_LOCK):
                    create_source_path_directory(document.archive_path)
                    shutil.move(archive_file, document.archive_path)

            # QuerySet.update() does not touch the instance we hold, so
            # mirror the committed values on it. Otherwise the index update
            # below would use the content loaded before parsing.
            document.archive_checksum = checksum
            document.content = text

        with AsyncWriter(index.open_index()) as writer:
            index.update_document(writer, document)

    except Exception as e:
        logger.error(f"Error while parsing document {document}: {str(e)}")
    finally:
        # Always let the parser clean up after itself, on success and on
        # failure.
        parser.cleanup()


class Command(BaseCommand):
    """
    Management command that (re)creates the archived version of documents.

    Every selected document is handed to ``handle_document`` in a pool of
    worker processes while a progress bar reports how many are done.
    """

    help = """
        Parses the original files of documents again and stores the
        archived versions produced by the parsers, along with the parsed
        content. By default, only documents without an archived version
        are processed.
    """.replace("    ", "")

    def add_arguments(self, parser):
        """Register the ``--overwrite`` and ``--document`` options."""
        parser.add_argument(
            "-f", "--overwrite",
            default=False,
            action="store_true",
            help="Recreates the archived document for documents that already "
                 "have an archived version."
        )
        parser.add_argument(
            "-d", "--document",
            default=None,
            type=int,
            required=False,
            help="Specify the ID of a document, and this command will only "
                 "run on this specific document."
        )

    def handle(self, *args, **options):
        """
        Select the documents to process and archive them in parallel.

        Without ``--overwrite`` only documents with an empty archive
        checksum are selected. With ``--document`` the selection is
        restricted to that single id. A keyboard interrupt aborts the run.
        """

        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)

        overwrite = options["overwrite"]

        # Compare against None so that an explicitly given id of 0 is not
        # mistaken for "no id given", which would process every document.
        if options['document'] is not None:
            documents = Document.objects.filter(pk=options['document'])
        else:
            documents = Document.objects.all()

        # Only the id and the checksum are needed to pick the documents, so
        # fetch just these two columns rather than full model instances.
        document_ids = [
            doc_id
            for doc_id, archive_checksum
            in documents.values_list("id", "archive_checksum")
            if overwrite or not archive_checksum
        ]

        # Note to future self: this prevents django from reusing database
        # connections between processes, which is bad and does not work
        # with postgres.
        db.connections.close_all()

        try:

            # Raise the level of the first root handler to ERROR, presumably
            # to keep regular log output from cluttering the progress bar.
            # Skipped when the root logger has no handlers configured.
            root_handlers = logging.getLogger().handlers
            if root_handlers:
                root_handlers[0].setLevel(logging.ERROR)

            with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:
                # list() drains the iterator so that every document is
                # processed and the progress bar advances per result.
                list(tqdm.tqdm(
                    pool.imap_unordered(
                        handle_document,
                        document_ids
                    ),
                    total=len(document_ids)
                ))
        except KeyboardInterrupt:
            print("Aborting...")
added checksums for archived documents. 2020-11-29 12:31:26 +01:00			`import hashlib`
added a simple document archiver that produces archived versions of all originals. 2020-11-28 11:49:07 +01:00			`import multiprocessing`

			`import logging`
			`import os`
			`import shutil`
			`import uuid`

proper document archiver with progress bar. 2020-12-03 01:04:52 +01:00			`import tqdm`
bugfix 2020-12-05 01:21:16 +01:00			`from django import db`
added a simple document archiver that produces archived versions of all originals. 2020-11-28 11:49:07 +01:00			`from django.conf import settings`
			`from django.core.management.base import BaseCommand`
proper document archiver with progress bar. 2020-12-03 01:04:52 +01:00			`from django.db import transaction`
locking for the document archiver 2021-01-18 15:18:03 +01:00			`from filelock import FileLock`
added a simple document archiver that produces archived versions of all originals. 2020-11-28 11:49:07 +01:00			`from whoosh.writing import AsyncWriter`

			`from documents.models import Document`
			`from ... import index`
proper document archiver with progress bar. 2020-12-03 01:04:52 +01:00			`from ...file_handling import create_source_path_directory`
added a simple document archiver that produces archived versions of all originals. 2020-11-28 11:49:07 +01:00			`from ...parsers import get_parser_class_for_mime_type`


rework most of the logging 2021-02-05 01:10:29 +01:00			`logger = logging.getLogger("paperless.management.archiver")`
proper document archiver with progress bar. 2020-12-03 01:04:52 +01:00

bugfix 2020-12-05 00:37:05 +01:00			`def handle_document(document_id):`
			`document = Document.objects.get(id=document_id)`

added a simple document archiver that produces archived versions of all originals. 2020-11-28 11:49:07 +01:00			`mime_type = document.mime_type`

			`parser_class = get_parser_class_for_mime_type(mime_type)`

			`parser = parser_class(logging_group=uuid.uuid4())`

proper document archiver with progress bar. 2020-12-03 01:04:52 +01:00			`try:`
			`parser.parse(document.source_path, mime_type)`

			`if parser.get_archive_path():`
			`with transaction.atomic():`
			`with open(parser.get_archive_path(), 'rb') as f:`
			`checksum = hashlib.md5(f.read()).hexdigest()`
			`# i'm going to save first so that in case the file move`
			`# fails, the database is rolled back.`
			`# we also don't use save() since that triggers the filehandling`
			`# logic, and we don't want that yet (file not yet in place)`
			`Document.objects.filter(pk=document.pk).update(`
			`archive_checksum=checksum,`
			`content=parser.get_text()`
			`)`
locking for the document archiver 2021-01-18 15:18:03 +01:00			`with FileLock(settings.MEDIA_LOCK):`
			`create_source_path_directory(document.archive_path)`
			`shutil.move(parser.get_archive_path(),`
			`document.archive_path)`
proper document archiver with progress bar. 2020-12-03 01:04:52 +01:00
			`with AsyncWriter(index.open_index()) as writer:`
			`index.update_document(writer, document)`
added a simple document archiver that produces archived versions of all originals. 2020-11-28 11:49:07 +01:00
proper document archiver with progress bar. 2020-12-03 01:04:52 +01:00			`except Exception as e:`
			`logger.error(f"Error while parsing document {document}: {str(e)}")`
			`finally:`
			`parser.cleanup()`
added a simple document archiver that produces archived versions of all originals. 2020-11-28 11:49:07 +01:00

remove lots of unused code 2021-02-04 23:40:53 +01:00			`class Command(BaseCommand):`
added a simple document archiver that produces archived versions of all originals. 2020-11-28 11:49:07 +01:00
			`help = """`
			`Using the current classification model, assigns correspondents, tags`
			`and document types to all documents, effectively allowing you to`
			`back-tag all previously indexed documents with metadata created (or`
			`modified) after their initial import.`
			`""".replace(" ", "")`

			`def add_arguments(self, parser):`
			`parser.add_argument(`
			`"-f", "--overwrite",`
			`default=False,`
			`action="store_true",`
			`help="Recreates the archived document for documents that already "`
			`"have an archived version."`
			`)`
proper document archiver with progress bar. 2020-12-03 01:04:52 +01:00			`parser.add_argument(`
			`"-d", "--document",`
			`default=None,`
			`type=int,`
			`required=False,`
			`help="Specify the ID of a document, and this command will only "`
			`"run on this specific document."`
			`)`
added a simple document archiver that produces archived versions of all originals. 2020-11-28 11:49:07 +01:00
			`def handle(self, args, *options):`

			`os.makedirs(settings.SCRATCH_DIR, exist_ok=True)`

			`overwrite = options["overwrite"]`

proper document archiver with progress bar. 2020-12-03 01:04:52 +01:00			`if options['document']:`
			`documents = Document.objects.filter(pk=options['document'])`
			`else:`
			`documents = Document.objects.all()`
added a simple document archiver that produces archived versions of all originals. 2020-11-28 11:49:07 +01:00
bugfix 2020-12-05 00:37:05 +01:00			`document_ids = list(map(`
			`lambda doc: doc.id,`
			`filter(`
			`lambda d: overwrite or not d.archive_checksum,`
			`documents`
			`)`
proper document archiver with progress bar. 2020-12-03 01:04:52 +01:00			`))`
added a simple document archiver that produces archived versions of all originals. 2020-11-28 11:49:07 +01:00
bugfix 2020-12-05 01:21:16 +01:00			`# Note to future self: this prevents django from reusing database`
			`# conncetions between processes, which is bad and does not work`
			`# with postgres.`
			`db.connections.close_all()`

			`try:`

			`logging.getLogger().handlers[0].level = logging.ERROR`
			`with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:`
			`list(tqdm.tqdm(`
			`pool.imap_unordered(`
			`handle_document,`
			`document_ids`
			`),`
			`total=len(document_ids)`
			`))`
			`except KeyboardInterrupt:`
			`print("Aborting...")`