paperless-ngx/src/documents/management/commands/document_importer.py

274 lines
10 KiB
Python
Raw Normal View History

import json
2020-12-22 15:50:27 +01:00
import logging
import os
import shutil
2020-12-21 17:35:05 +01:00
from contextlib import contextmanager
from pathlib import Path
2020-12-22 15:50:27 +01:00
import tqdm
from django.conf import settings
from django.contrib.auth.models import Permission
from django.contrib.contenttypes.models import ContentType
from django.core.exceptions import FieldDoesNotExist
from django.core.management import call_command
from django.core.management.base import BaseCommand
from django.core.management.base import CommandError
from django.core.serializers.base import DeserializationError
from django.db import IntegrityError
from django.db import transaction
from django.db.models.signals import m2m_changed
from django.db.models.signals import post_save
from filelock import FileLock
from documents.file_handling import create_source_path_directory
from documents.models import Document
from documents.parsers import run_convert
from documents.settings import EXPORTER_ARCHIVE_NAME
from documents.settings import EXPORTER_FILE_NAME
from documents.settings import EXPORTER_THUMBNAIL_NAME
from documents.signals.handlers import update_filename_and_move_files
from paperless import version
2020-12-21 17:35:05 +01:00
@contextmanager
def disable_signal(sig, receiver, sender):
    """
    Temporarily detach ``receiver`` from ``sig`` for the given ``sender``.

    The receiver is disconnected on entry and connected again on exit, also
    when the managed block (or the disconnect itself) raises. The reconnect
    is unconditional: it happens even if the receiver was not connected
    beforehand.
    """
    hookup = {"receiver": receiver, "sender": sender}
    try:
        sig.disconnect(**hookup)
        yield
    finally:
        sig.connect(**hookup)
2021-02-04 23:40:53 +01:00
class Command(BaseCommand):
    """
    Import database records and document files from an export directory.

    The directory passed as ``source`` must contain a ``manifest.json``; any
    ``*-manifest.json`` files found below it are loaded in addition. All
    manifests are fed to ``loaddata`` and the files referenced by the
    ``documents.document`` records are then copied to the paths reported by
    the corresponding ``Document`` objects.
    """

    help = (
        "Using a manifest.json file, load the data from there, and import the "
        "documents it refers to."
    )

    def add_arguments(self, parser):
        """Register the positional ``source`` path and ``--no-progress-bar``."""
        parser.add_argument("source")
        parser.add_argument(
            "--no-progress-bar",
            default=False,
            action="store_true",
            help="If set, the progress bar will not be shown",
        )

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Resolved export directory (Path), set in handle().
        self.source = None
        # Combined list of records from every manifest file.
        self.manifest = None
        # Version string read from version.json, or None when absent.
        self.version = None

    def handle(self, *args, **options):
        """
        Run the import.

        Raises:
            CommandError: if the source path is missing or unreadable, holds
                no manifest.json, or the manifest refers to document files
                that are missing or unreadable.
            FieldDoesNotExist, DeserializationError, IntegrityError:
                re-raised, after an explanatory message, when loading a
                manifest into the database fails.
        """
        # Only let errors through on the first root handler; the root logger
        # may have no handlers at all, in which case there is nothing to do.
        root_handlers = logging.getLogger().handlers
        if root_handlers:
            root_handlers[0].setLevel(logging.ERROR)

        self.source = Path(options["source"]).resolve()

        if not self.source.exists():
            raise CommandError("That path doesn't exist")

        if not os.access(self.source, os.R_OK):
            raise CommandError("That path doesn't appear to be readable")

        main_manifest_path = self.source / "manifest.json"
        self._check_manifest_exists(main_manifest_path)

        with main_manifest_path.open() as infile:
            self.manifest = json.load(infile)
        manifest_paths = [main_manifest_path]

        # Additional manifests are appended after the main one; sorting keeps
        # the load order stable between runs.
        for extra_manifest_path in sorted(self.source.glob("**/*-manifest.json")):
            with extra_manifest_path.open() as infile:
                self.manifest += json.load(infile)
            manifest_paths.append(extra_manifest_path)

        version_path = self.source / "version.json"
        if version_path.exists():
            with version_path.open() as infile:
                self.version = json.load(infile)["version"]
            # Provide an initial warning if needed to the user
            if self.version != version.__full_version_str__:
                self.stdout.write(
                    self.style.WARNING(
                        "Version mismatch: "
                        f"Currently {version.__full_version_str__},"
                        f" importing {self.version}."
                        " Continuing, but import may fail.",
                    ),
                )
        else:
            self.stdout.write(self.style.NOTICE("No version.json file located"))

        self._check_manifest_valid()

        # update_filename_and_move_files is detached while records are loaded
        # and while the imported documents are saved below.
        with disable_signal(
            post_save,
            receiver=update_filename_and_move_files,
            sender=Document,
        ), disable_signal(
            m2m_changed,
            receiver=update_filename_and_move_files,
            sender=Document.tags.through,
        ):
            # Fill up the database with whatever is in the manifest
            try:
                with transaction.atomic():
                    # delete these since pk can change, re-created from import.
                    # This must happen once, before the first manifest: doing
                    # it per manifest would wipe the rows the previous
                    # loaddata call just created.
                    ContentType.objects.all().delete()
                    Permission.objects.all().delete()
                    for manifest_path in manifest_paths:
                        call_command("loaddata", manifest_path)
            except (FieldDoesNotExist, DeserializationError, IntegrityError):
                self.stdout.write(self.style.ERROR("Database import failed"))
                if self.version is None:
                    self.stdout.write(
                        self.style.ERROR("No version information present"),
                    )
                elif self.version != version.__full_version_str__:
                    self.stdout.write(
                        self.style.ERROR(
                            "Version mismatch: "
                            f"Currently {version.__full_version_str__},"
                            f" importing {self.version}",
                        ),
                    )
                raise

            self._import_files_from_manifest(options["no_progress_bar"])

        self.stdout.write("Updating search index...")
        call_command(
            "document_index",
            "reindex",
            no_progress_bar=options["no_progress_bar"],
        )

    @staticmethod
    def _check_manifest_exists(path: Path):
        """Raise CommandError when ``path`` (the manifest.json) does not exist."""
        if not path.exists():
            raise CommandError(
                "That directory doesn't appear to contain a manifest.json file.",
            )

    @staticmethod
    def _check_file_readable(path: Path, kind: str):
        """Raise CommandError unless a byte can be read from ``path``.

        ``kind`` ("original" / "archive") only labels the error message.
        """
        try:
            with path.open(mode="rb") as infile:
                infile.read(1)
        except OSError as e:
            raise CommandError(f"Failed to read from {kind} file {path}") from e

    def _check_manifest_valid(self):
        """
        Attempts to verify the manifest is valid. Namely checking the files
        referred to exist and the files can be read from

        Only ``documents.document`` records are inspected. Raises
        CommandError on the first record that names no document file, or
        whose original/archive file is missing or unreadable.
        """
        self.stdout.write("Checking the manifest")
        for record in self.manifest:
            if record["model"] != "documents.document":
                continue

            if EXPORTER_FILE_NAME not in record:
                raise CommandError(
                    "The manifest file contains a record which does not "
                    "refer to an actual document file.",
                )

            doc_file = record[EXPORTER_FILE_NAME]
            doc_path = self.source / doc_file
            if not doc_path.exists():
                raise CommandError(
                    f'The manifest file refers to "{doc_file}" which does not '
                    "appear to be in the source directory.",
                )
            self._check_file_readable(doc_path, "original")

            if EXPORTER_ARCHIVE_NAME in record:
                archive_file = record[EXPORTER_ARCHIVE_NAME]
                doc_archive_path = self.source / archive_file
                if not doc_archive_path.exists():
                    raise CommandError(
                        f"The manifest file refers to {archive_file} which "
                        f"does not appear to be in the source directory.",
                    )
                self._check_file_readable(doc_archive_path, "archive")

    def _import_files_from_manifest(self, progress_bar_disable):
        """
        Copy the files of every document record into place and save it.

        Args:
            progress_bar_disable: passed to tqdm as ``disable``; a true value
                hides the progress bar.

        Raises:
            FileExistsError: if a file already exists at a document's
                ``source_path``.
        """
        os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
        os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True)
        os.makedirs(settings.ARCHIVE_DIR, exist_ok=True)

        self.stdout.write("Copy files into paperless...")

        manifest_documents = [
            record
            for record in self.manifest
            if record["model"] == "documents.document"
        ]

        for record in tqdm.tqdm(manifest_documents, disable=progress_bar_disable):
            document = Document.objects.get(pk=record["pk"])

            document_path = self.source / record[EXPORTER_FILE_NAME]

            # Thumbnail and archive entries are optional per record.
            thumbnail_path = None
            if EXPORTER_THUMBNAIL_NAME in record:
                thumbnail_path = (
                    self.source / record[EXPORTER_THUMBNAIL_NAME]
                ).resolve()

            archive_path = None
            if EXPORTER_ARCHIVE_NAME in record:
                archive_path = self.source / record[EXPORTER_ARCHIVE_NAME]

            document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED

            with FileLock(settings.MEDIA_LOCK):
                # Never overwrite a file that is already in place.
                if os.path.isfile(document.source_path):
                    raise FileExistsError(document.source_path)

                create_source_path_directory(document.source_path)
                shutil.copy2(document_path, document.source_path)

                if thumbnail_path:
                    if thumbnail_path.suffix in {".png", ".PNG"}:
                        # PNG thumbnails go through run_convert instead of a
                        # plain copy — presumably to produce the thumbnail
                        # format document.thumbnail_path expects; confirm
                        # against run_convert.
                        run_convert(
                            density=300,
                            scale="500x5000>",
                            alpha="remove",
                            strip=True,
                            trim=False,
                            auto_orient=True,
                            input_file=f"{thumbnail_path}[0]",
                            output_file=str(document.thumbnail_path),
                        )
                    else:
                        shutil.copy2(thumbnail_path, document.thumbnail_path)

                if archive_path:
                    create_source_path_directory(document.archive_path)
                    # TODO: this assumes that the export is valid and
                    #  archive_filename is present on all documents with
                    #  archived files
                    shutil.copy2(archive_path, document.archive_path)

            document.save()