paperless-ngx/src/documents/migrations/1012_fix_archive_files.py

# Generated by Django 3.1.6 on 2021-02-07 22:26
import datetime
import hashlib
import logging
import os
import shutil
from time import sleep

import pathvalidate
from django.conf import settings
from django.db import migrations
from django.db import models
from django.template.defaultfilters import slugify

from documents.file_handling import defaultdictNoStr
from documents.file_handling import many_to_dictionary

logger = logging.getLogger("paperless.migrations")

###############################################################################
# This is code copied straight from paperless before the change.
###############################################################################
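# (It is duplicated here because a migration has to keep working even when
# the application code it was copied from changes later on.)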


def archive_name_from_filename(filename):
    return os.path.splitext(filename)[0] + ".pdf"


def archive_path_old(doc):
    if doc.filename:
        fname = archive_name_from_filename(doc.filename)
    else:
        fname = f"{doc.pk:07}.pdf"

    return os.path.join(settings.ARCHIVE_DIR, fname)


STORAGE_TYPE_GPG = "gpg"
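

# Note on the two naming schemes: archive_path_old derives the archive file
# name from the original's filename, so two originals that differ only in
# extension (e.g. "scan.pdf" and "scan.jpg") collide on the same archive
# file. The new scheme below stores a dedicated, unique archive_filename on
# the document instead.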
def archive_path_new(doc):
    if doc.archive_filename is not None:
        return os.path.join(settings.ARCHIVE_DIR, str(doc.archive_filename))
    else:
        return None


def source_path(doc):
    if doc.filename:
        fname = str(doc.filename)
    else:
        fname = f"{doc.pk:07}{doc.file_type}"
        if doc.storage_type == STORAGE_TYPE_GPG:
            fname += ".gpg"  # pragma: no cover

    return os.path.join(settings.ORIGINALS_DIR, fname)
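

# Unique names are found by brute force: generate a candidate, and if it is
# already taken on disk, retry with an incrementing counter suffix.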
def generate_unique_filename(doc, archive_filename=False):
    if archive_filename:
        old_filename = doc.archive_filename
        root = settings.ARCHIVE_DIR
    else:
        old_filename = doc.filename
        root = settings.ORIGINALS_DIR

    counter = 0

    while True:
        new_filename = generate_filename(
            doc,
            counter,
            archive_filename=archive_filename,
        )
        if new_filename == old_filename:
            # still the same as before.
            return new_filename

        if os.path.exists(os.path.join(root, new_filename)):
            counter += 1
        else:
            return new_filename
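

# Example: a FILENAME_FORMAT such as "{created_year}/{correspondent}/{title}"
# would yield names like "2021/some-correspondent/some-title.pdf" (the ".pdf"
# suffix is forced for archive files below).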
def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
    path = ""

    try:
        if settings.FILENAME_FORMAT is not None:
            tags = defaultdictNoStr(lambda: slugify(None), many_to_dictionary(doc.tags))

            tag_list = pathvalidate.sanitize_filename(
                ",".join(sorted([tag.name for tag in doc.tags.all()])),
                replacement_text="-",
            )

            if doc.correspondent:
                correspondent = pathvalidate.sanitize_filename(
                    doc.correspondent.name,
                    replacement_text="-",
                )
            else:
                correspondent = "none"

            if doc.document_type:
                document_type = pathvalidate.sanitize_filename(
                    doc.document_type.name,
                    replacement_text="-",
                )
            else:
                document_type = "none"

            path = settings.FILENAME_FORMAT.format(
                title=pathvalidate.sanitize_filename(doc.title, replacement_text="-"),
                correspondent=correspondent,
                document_type=document_type,
                created=datetime.date.isoformat(doc.created),
                created_year=doc.created.year if doc.created else "none",
                created_month=f"{doc.created.month:02}" if doc.created else "none",
                created_day=f"{doc.created.day:02}" if doc.created else "none",
                added=datetime.date.isoformat(doc.added),
                added_year=doc.added.year if doc.added else "none",
                added_month=f"{doc.added.month:02}" if doc.added else "none",
                added_day=f"{doc.added.day:02}" if doc.added else "none",
                tags=tags,
                tag_list=tag_list,
            ).strip()

        path = path.strip(os.sep)

    except (ValueError, KeyError, IndexError):
        logger.warning(
            f"Invalid PAPERLESS_FILENAME_FORMAT: "
            f"{settings.FILENAME_FORMAT}, falling back to default",
        )

    counter_str = f"_{counter:02}" if counter else ""

    filetype_str = ".pdf" if archive_filename else doc.file_type

    if len(path) > 0:
        filename = f"{path}{counter_str}{filetype_str}"
    else:
        filename = f"{doc.pk:07}{counter_str}{filetype_str}"

    # Append .gpg for encrypted files
    if append_gpg and doc.storage_type == STORAGE_TYPE_GPG:
        filename += ".gpg"

    return filename


###############################################################################
# This code performs the bidirectional archive file transformation.
###############################################################################


def parse_wrapper(parser, path, mime_type, file_name):
    # this is here so that I can mock this out for testing.
    parser.parse(path, mime_type, file_name)
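

# Re-parses a document's original file and writes a fresh archive version
# under the new, unique archive filename, updating content and checksum.
# Parsing is retried a few times, mainly to wait for slow-starting parser
# services (see the tika note below).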
def create_archive_version(doc, retry_count=3):
    from documents.parsers import DocumentParser
    from documents.parsers import ParseError
    from documents.parsers import get_parser_class_for_mime_type

    logger.info(f"Regenerating archive document for document ID:{doc.id}")
    parser_class = get_parser_class_for_mime_type(doc.mime_type)
    for try_num in range(retry_count):
        parser: DocumentParser = parser_class(None, None)
        try:
            parse_wrapper(
                parser,
                source_path(doc),
                doc.mime_type,
                os.path.basename(doc.filename),
            )
            doc.content = parser.get_text()

            if parser.get_archive_path() and os.path.isfile(parser.get_archive_path()):
                doc.archive_filename = generate_unique_filename(
                    doc,
                    archive_filename=True,
                )
                with open(parser.get_archive_path(), "rb") as f:
                    doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
                os.makedirs(os.path.dirname(archive_path_new(doc)), exist_ok=True)
                shutil.copy2(parser.get_archive_path(), archive_path_new(doc))
            else:
                doc.archive_checksum = None
                logger.error(
                    f"Parser did not return an archive document for document "
                    f"ID:{doc.id}. Removing archive document.",
                )
            doc.save()
            return
        except ParseError:
            if try_num + 1 == retry_count:
                logger.exception(
                    f"Unable to regenerate archive document for ID:{doc.id}. You "
                    f"need to invoke the document_archiver management command "
                    f"manually for that document.",
                )
                doc.archive_checksum = None
                doc.save()
                return
            else:
                # This is mostly here for the tika parser in docker
                # environments. The servers for parsing need to come up first,
                # and the docker setup doesn't ensure that tika is running
                # before attempting migrations.
                logger.error("Parse error, will try again in 5 seconds...")
                sleep(5)
        finally:
            parser.cleanup()
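

# Forward migration: find documents whose old-style archive paths collide,
# make sure every other archive file is where it should be, then remove the
# colliding archive files and regenerate them under unique names.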
def move_old_to_new_locations(apps, schema_editor):
    Document = apps.get_model("documents", "Document")

    affected_document_ids = set()

    old_archive_path_to_id = {}

    # check for documents that have incorrect archive versions
    for doc in Document.objects.filter(archive_checksum__isnull=False):
        old_path = archive_path_old(doc)

        if old_path in old_archive_path_to_id:
            affected_document_ids.add(doc.id)
            affected_document_ids.add(old_archive_path_to_id[old_path])
        else:
            old_archive_path_to_id[old_path] = doc.id

    # check that archive files of all unaffected documents are in place
    for doc in Document.objects.filter(archive_checksum__isnull=False):
        old_path = archive_path_old(doc)
        if doc.id not in affected_document_ids and not os.path.isfile(old_path):
            raise ValueError(
                f"Archived document ID:{doc.id} does not exist at: {old_path}",
            )

    # check that we can regenerate affected archive versions
    for doc_id in affected_document_ids:
        from documents.parsers import get_parser_class_for_mime_type

        doc = Document.objects.get(id=doc_id)
        parser_class = get_parser_class_for_mime_type(doc.mime_type)
        if not parser_class:
            raise ValueError(
                f"Document ID:{doc.id} has an invalid archived document, "
                f"but no parsers are available. Cannot migrate.",
            )

    for doc in Document.objects.filter(archive_checksum__isnull=False):
        if doc.id in affected_document_ids:
            old_path = archive_path_old(doc)
            # remove affected archive versions
            if os.path.isfile(old_path):
                logger.debug(f"Removing {old_path}")
                os.unlink(old_path)
        else:
            # Set archive path for unaffected files
            doc.archive_filename = archive_name_from_filename(doc.filename)
            Document.objects.filter(id=doc.id).update(
                archive_filename=doc.archive_filename,
            )

    # regenerate archive documents
    for doc_id in affected_document_ids:
        doc = Document.objects.get(id=doc_id)
        create_archive_version(doc)
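

# Reverse migration: move archive files back to their filename-derived
# locations, refusing to migrate if two documents would claim the same old
# path or the target file already exists.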
def move_new_to_old_locations(apps, schema_editor):
    Document = apps.get_model("documents", "Document")

    old_archive_paths = set()

    for doc in Document.objects.filter(archive_checksum__isnull=False):
        new_archive_path = archive_path_new(doc)
        old_archive_path = archive_path_old(doc)
        if old_archive_path in old_archive_paths:
            raise ValueError(
                f"Cannot migrate: Archive file name {old_archive_path} of "
                f"document {doc.filename} would clash with another archive "
                f"filename.",
            )
        old_archive_paths.add(old_archive_path)
        if new_archive_path != old_archive_path and os.path.isfile(old_archive_path):
            raise ValueError(
                f"Cannot migrate: Cannot move {new_archive_path} to "
                f"{old_archive_path}: file already exists.",
            )

    for doc in Document.objects.filter(archive_checksum__isnull=False):
        new_archive_path = archive_path_new(doc)
        old_archive_path = archive_path_old(doc)
        if new_archive_path != old_archive_path:
            logger.debug(f"Moving {new_archive_path} to {old_archive_path}")
            shutil.move(new_archive_path, old_archive_path)


class Migration(migrations.Migration):

    dependencies = [
        ("documents", "1011_auto_20210101_2340"),
    ]

    operations = [
        migrations.AddField(
            model_name="document",
            name="archive_filename",
            field=models.FilePathField(
                default=None,
                editable=False,
                help_text="Current archive filename in storage",
                max_length=1024,
                null=True,
                unique=True,
                verbose_name="archive filename",
            ),
        ),
        migrations.AlterField(
            model_name="document",
            name="filename",
            field=models.FilePathField(
                default=None,
                editable=False,
                help_text="Current filename in storage",
                max_length=1024,
                null=True,
                unique=True,
                verbose_name="filename",
            ),
        ),
        migrations.RunPython(move_old_to_new_locations, move_new_to_old_locations),
    ]