paperless-ngx/src/documents/management/commands/document_archiver.py

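"""
Management command that creates archived versions of documents.

For each selected document, the parser matching the document's MIME type
extracts its text and produces an archive file; the database record and
the search index are then updated to match. Documents are processed in
parallel in a multiprocessing pool.
"""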

import hashlib
import logging
import multiprocessing
import os
import shutil
import uuid

import tqdm
from django import db
from django.conf import settings
from django.core.management.base import BaseCommand
from django.db import transaction
from filelock import FileLock

from documents.models import Document

from ... import index
from ...file_handling import create_source_path_directory, generate_unique_filename
from ...parsers import get_parser_class_for_mime_type

logger = logging.getLogger("paperless.management.archiver")


def handle_document(document_id):
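    """
    Create (or recreate) the archived version of a single document.

    Runs inside a pool worker process: errors are logged instead of being
    raised, and the parser's temporary files are always cleaned up.
    """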
    document = Document.objects.get(id=document_id)

    mime_type = document.mime_type

    parser_class = get_parser_class_for_mime_type(mime_type)

    if not parser_class:
        logger.error(
            f"No parser found for mime type {mime_type}, cannot "
            f"archive document {document} (ID: {document_id})"
        )
        return

    parser = parser_class(logging_group=uuid.uuid4())

    try:
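        # Parse the document. This extracts its text and, for parsers that
        # support it, produces the archive version in a temporary location.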
        parser.parse(document.source_path, mime_type, document.get_public_filename())

        thumbnail = parser.get_optimised_thumbnail(
            document.source_path, mime_type, document.get_public_filename()
        )

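        # Not every parser produces an archive version; the database update
        # and the file moves below only apply when one exists.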
        if parser.get_archive_path():
            with transaction.atomic():
                with open(parser.get_archive_path(), "rb") as f:
                    checksum = hashlib.md5(f.read()).hexdigest()
                # Save the new values to the database first so that, if the
                # file move below fails, the transaction (and with it the
                # database) is rolled back.
                # Don't use save() here: that triggers the file handling
                # logic, which must not run while the file is not yet in
                # place.
                document.archive_filename = generate_unique_filename(
                    document, archive_filename=True
                )
                Document.objects.filter(pk=document.pk).update(
                    archive_checksum=checksum,
                    content=parser.get_text(),
                    archive_filename=document.archive_filename,
                )
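                # Hold the media lock while moving files into place, so
                # that no other paperless process handles them concurrently.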
                with FileLock(settings.MEDIA_LOCK):
                    create_source_path_directory(document.archive_path)
                    shutil.move(parser.get_archive_path(), document.archive_path)
                    shutil.move(thumbnail, document.thumbnail_path)

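        # Reflect the newly extracted content in the search index.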
        with index.open_index_writer() as writer:
            index.update_document(writer, document)

    except Exception:
        logger.exception(
            f"Error while parsing document {document} (ID: {document_id})"
        )
    finally:
        parser.cleanup()


class Command(BaseCommand):
help = """
Using the current classification model, assigns correspondents, tags
and document types to all documents, effectively allowing you to
back-tag all previously indexed documents with metadata created (or
modified) after their initial import.
2022-02-27 15:26:41 +01:00
""".replace(
" ", ""
)
    def add_arguments(self, parser):
        parser.add_argument(
            "-f",
            "--overwrite",
            default=False,
            action="store_true",
            help="Recreates the archived document for documents that already "
            "have an archived version.",
        )

        parser.add_argument(
            "-d",
            "--document",
            default=None,
            type=int,
            required=False,
            help="Specify the ID of a document, and this command will only "
            "run on this specific document.",
        )

        parser.add_argument(
            "--no-progress-bar",
            default=False,
            action="store_true",
            help="If set, the progress bar will not be shown.",
        )

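    # Example invocations (the command name is derived from this module's
    # file name, document_archiver.py):
    #
    #   python3 manage.py document_archiver
    #   python3 manage.py document_archiver --overwrite
    #   python3 manage.py document_archiver --document 123 --no-progress-bar
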
    def handle(self, *args, **options):

        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)

        overwrite = options["overwrite"]

        if options["document"]:
            documents = Document.objects.filter(pk=options["document"])
        else:
            documents = Document.objects.all()

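        # Unless --overwrite is given, skip documents that already have an
        # archive version.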
        document_ids = [
            document.id
            for document in documents
            if overwrite or not document.has_archive_version
        ]

        # Note to future self: this prevents django from reusing database
        # connections between processes, which is bad and does not work
        # with postgres.
        db.connections.close_all()

        try:
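            # Raise the threshold of the first root log handler to ERROR,
            # so that routine log output doesn't garble the progress bar.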
            logging.getLogger().handlers[0].level = logging.ERROR
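            # Archive the documents in parallel, using as many worker
            # processes as paperless is configured to use for tasks. The
            # list() call consumes the lazy iterator, which both drives the
            # workers and lets tqdm track completed documents.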
            with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:
                list(
                    tqdm.tqdm(
                        pool.imap_unordered(handle_document, document_ids),
                        total=len(document_ids),
                        disable=options["no_progress_bar"],
                    )
                )
        except KeyboardInterrupt:
            print("Aborting...")