paperless-ngx/src/documents/sanity_checker.py

167 lines
6.1 KiB
Python
Raw Normal View History

2020-11-25 16:04:58 +01:00
import hashlib
import logging
import uuid
from collections import defaultdict
from pathlib import Path
from typing import Final
2020-11-25 16:04:58 +01:00
from celery import states
2020-11-25 16:04:58 +01:00
from django.conf import settings
from django.utils import timezone
from tqdm import tqdm
2020-11-25 16:04:58 +01:00
from documents.models import Document
from documents.models import PaperlessTask
2020-11-25 16:04:58 +01:00
class SanityCheckMessages:
    """Accumulates sanity-check findings, grouped per document primary key.

    A key of ``None`` collects findings that are not tied to any single
    document (e.g. orphaned files in the media directory).
    """

    def __init__(self):
        self._messages: dict[int, list[dict]] = defaultdict(list)
        self.has_error = False
        self.has_warning = False

    def _add(self, doc_pk, level, message):
        # Single funnel for all findings; keeps the record shape consistent.
        self._messages[doc_pk].append({"level": level, "message": message})

    def error(self, doc_pk, message):
        self._add(doc_pk, logging.ERROR, message)
        self.has_error = True

    def warning(self, doc_pk, message):
        self._add(doc_pk, logging.WARNING, message)
        self.has_warning = True

    def info(self, doc_pk, message):
        self._add(doc_pk, logging.INFO, message)

    def log_messages(self):
        """Emit every collected finding through the sanity-checker logger."""
        logger = logging.getLogger("paperless.sanity_checker")

        if not self._messages:
            logger.info("Sanity checker detected no issues.")
            return

        # Query once
        all_docs = Document.global_objects.all()

        for doc_pk, findings in self._messages.items():
            if doc_pk is not None:
                doc = all_docs.get(pk=doc_pk)
                logger.info(
                    f"Detected following issue(s) with document #{doc.pk},"
                    f" titled {doc.title}",
                )
            for finding in findings:
                logger.log(finding["level"], finding["message"])

    def __len__(self):
        return len(self._messages)

    def __getitem__(self, item):
        return self._messages[item]
2020-11-25 16:04:58 +01:00
class SanityCheckFailedException(Exception):
    """Signals that a sanity check did not pass (raised by callers of this module)."""
2020-11-25 16:04:58 +01:00
def _check_thumbnail(doc, messages, present_files):
    # Verify the document's thumbnail exists and is readable; claim it from
    # present_files so it is not reported as orphaned later.
    thumbnail_path = Path(doc.thumbnail_path).resolve()
    if not thumbnail_path.exists() or not thumbnail_path.is_file():
        messages.error(doc.pk, "Thumbnail of document does not exist.")
        return
    present_files.discard(thumbnail_path)
    try:
        _ = thumbnail_path.read_bytes()
    except OSError as e:
        messages.error(doc.pk, f"Cannot read thumbnail file of document: {e}")


def _check_original(doc, messages, present_files):
    # Verify the original file exists, is readable, and matches the stored
    # MD5 checksum; claim it from present_files.
    source_path = Path(doc.source_path).resolve()
    if not source_path.exists() or not source_path.is_file():
        messages.error(doc.pk, "Original of document does not exist.")
        return
    present_files.discard(source_path)
    try:
        checksum = hashlib.md5(source_path.read_bytes()).hexdigest()
    except OSError as e:
        messages.error(doc.pk, f"Cannot read original file of document: {e}")
    else:
        if checksum != doc.checksum:
            messages.error(
                doc.pk,
                "Checksum mismatch. "
                f"Stored: {doc.checksum}, actual: {checksum}.",
            )


def _check_archive(doc, messages, present_files):
    # Verify archive metadata consistency (checksum and filename must be set
    # together) and, when present, the archive file's existence and checksum.
    if doc.archive_checksum is not None and doc.archive_filename is None:
        messages.error(
            doc.pk,
            "Document has an archive file checksum, but no archive filename.",
        )
        return
    if doc.archive_checksum is None and doc.archive_filename is not None:
        messages.error(
            doc.pk,
            "Document has an archive file, but its checksum is missing.",
        )
        return
    if not doc.has_archive_version:
        return
    archive_path = Path(doc.archive_path).resolve()
    if not archive_path.exists() or not archive_path.is_file():
        messages.error(doc.pk, "Archived version of document does not exist.")
        return
    present_files.discard(archive_path)
    try:
        checksum = hashlib.md5(archive_path.read_bytes()).hexdigest()
    except OSError as e:
        messages.error(
            doc.pk,
            f"Cannot read archive file of document : {e}",
        )
    else:
        if checksum != doc.archive_checksum:
            messages.error(
                doc.pk,
                "Checksum mismatch of archived document. "
                f"Stored: {doc.archive_checksum}, "
                f"actual: {checksum}.",
            )


def check_sanity(*, progress=False, scheduled=True) -> SanityCheckMessages:
    """Run a sanity check over every document and the media directory.

    For each document this verifies the thumbnail, the original file
    (existence, readability, checksum) and the archive version, and reports
    any file under ``MEDIA_ROOT`` that no document accounts for as orphaned.
    A ``PaperlessTask`` row is created up front and finalized with the
    outcome before returning.

    Args:
        progress: Show a tqdm progress bar while iterating documents.
        scheduled: Record the task as a scheduled (vs. manual) run.

    Returns:
        The collected ``SanityCheckMessages``; callers inspect
        ``has_error`` / ``has_warning`` or call ``log_messages()``.
    """
    paperless_task = PaperlessTask.objects.create(
        task_id=uuid.uuid4(),
        type=PaperlessTask.TaskType.SCHEDULED_TASK
        if scheduled
        else PaperlessTask.TaskType.MANUAL_TASK,
        task_name=PaperlessTask.TaskName.CHECK_SANITY,
        status=states.STARTED,
        date_created=timezone.now(),
        date_started=timezone.now(),
    )
    messages = SanityCheckMessages()

    # Every regular file under MEDIA_ROOT. Files claimed by a document are
    # removed as we go; whatever remains afterwards is orphaned.
    present_files = {
        x.resolve() for x in Path(settings.MEDIA_ROOT).glob("**/*") if not x.is_dir()
    }
    # The media lock file is expected and never belongs to a document.
    present_files.discard(Path(settings.MEDIA_LOCK).resolve())

    for doc in tqdm(Document.global_objects.all(), disable=not progress):
        _check_thumbnail(doc, messages, present_files)
        _check_original(doc, messages, present_files)
        _check_archive(doc, messages, present_files)

        # Other document checks.
        if not doc.content:
            messages.info(doc.pk, "Document contains no OCR data")

    for extra_file in present_files:
        messages.warning(None, f"Orphaned file in media dir: {extra_file}")

    # Finalize the task record with a short summary; details go to the log.
    paperless_task.status = states.SUCCESS if not messages.has_error else states.FAILURE
    paperless_task.result = f"{len(messages)} issues found."
    if messages.has_error:
        paperless_task.result += " Check logs for details."
    paperless_task.date_done = timezone.now()
    paperless_task.save(update_fields=["status", "result", "date_done"])

    return messages