paperless-ngx/src/documents/sanity_checker.py

import hashlib
import logging
from collections import defaultdict
from pathlib import Path
from typing import Final

from django.conf import settings
from tqdm import tqdm

from documents.models import Document


class SanityCheckMessages:
    """
    Collects the findings of a sanity check, grouped by document primary key.

    Every finding is stored as a ``{"level": <logging level>, "message": <str>}``
    dict in a list keyed by the primary key of the affected document.  Findings
    that do not belong to a document are stored under the key ``None``.

    ``has_error`` and ``has_warning`` turn ``True`` as soon as at least one
    finding of the respective level has been recorded.
    """

    def __init__(self):
        # Maps a document pk (or None) to the list of findings recorded for it
        self._messages = defaultdict(list)
        self.has_error = False
        self.has_warning = False

    def error(self, doc_pk, message):
        """Record an error level finding for ``doc_pk`` and set ``has_error``."""
        self._messages[doc_pk].append({"level": logging.ERROR, "message": message})
        self.has_error = True

    def warning(self, doc_pk, message):
        """Record a warning level finding for ``doc_pk`` and set ``has_warning``."""
        self._messages[doc_pk].append({"level": logging.WARNING, "message": message})
        self.has_warning = True

    def info(self, doc_pk, message):
        """Record an info level finding for ``doc_pk``; no flag is changed."""
        self._messages[doc_pk].append({"level": logging.INFO, "message": message})

    def log_messages(self):
        """
        Write all recorded findings to the ``paperless.sanity_checker`` logger.

        Findings attached to a document are preceded by a header line naming
        the document.  Findings stored under ``None`` are logged without one.
        """
        logger = logging.getLogger("paperless.sanity_checker")

        if not self._messages:
            logger.info("Sanity checker detected no issues.")
            return

        doc_pks = [pk for pk in self._messages if pk is not None]
        # Fetch all referenced documents with a single lookup instead of one
        # query per document.  Skipped entirely if there is nothing to fetch.
        docs_by_pk = Document.objects.in_bulk(doc_pks) if doc_pks else {}

        for doc_pk, doc_messages in self._messages.items():
            if doc_pk is not None:
                doc = docs_by_pk.get(doc_pk)
                if doc is not None:
                    logger.info(
                        f"Detected following issue(s) with document #{doc.pk},"
                        f" titled {doc.title}",
                    )
                else:
                    # The document vanished between the check and the logging.
                    # Still report its findings rather than failing here.
                    logger.info(
                        f"Detected following issue(s) with document #{doc_pk},"
                        " which no longer exists",
                    )
            for msg in doc_messages:
                logger.log(msg["level"], msg["message"])

    def __len__(self):
        """Return the number of keys (documents, plus ``None``) with findings."""
        return len(self._messages)

    def __getitem__(self, item):
        """
        Return the list of findings recorded for ``item``.

        An unknown key yields an empty list.  The lookup deliberately bypasses
        the defaultdict factory, so that merely reading a key never creates an
        entry which would then be counted by ``len()``.
        """
        return self._messages.get(item, [])


class SanityCheckFailedException(Exception):
    """Exception type for a failed sanity check; adds nothing to ``Exception``."""


def _md5_of_file(path: Path, chunk_size: int = 1024 * 1024) -> str:
    """Return the hex MD5 digest of the file at ``path``, read in chunks."""
    digest = hashlib.md5()
    with path.open("rb") as handle:
        # Stream the file so that large documents are never held in memory
        # as a whole just to compute their checksum
        while chunk := handle.read(chunk_size):
            digest.update(chunk)
    return digest.hexdigest()


def _check_thumbnail(
    doc: Document,
    messages: SanityCheckMessages,
    present_files: set,
) -> None:
    """Record an error if the thumbnail of ``doc`` is missing or unreadable."""
    thumbnail_path: Final[Path] = Path(doc.thumbnail_path).resolve()
    # is_file() is already False for a path which does not exist
    if not thumbnail_path.is_file():
        messages.error(doc.pk, "Thumbnail of document does not exist.")
        return

    # The file belongs to a document, so it is not an orphan
    present_files.discard(thumbnail_path)
    try:
        thumbnail_path.read_bytes()
    except OSError as e:
        messages.error(doc.pk, f"Cannot read thumbnail file of document: {e}")


def _check_original(
    doc: Document,
    messages: SanityCheckMessages,
    present_files: set,
) -> None:
    """Record an error if the original of ``doc`` is missing, unreadable or altered."""
    source_path: Final[Path] = Path(doc.source_path).resolve()
    if not source_path.is_file():
        messages.error(doc.pk, "Original of document does not exist.")
        return

    present_files.discard(source_path)
    try:
        checksum = _md5_of_file(source_path)
    except OSError as e:
        messages.error(doc.pk, f"Cannot read original file of document: {e}")
        return

    if checksum != doc.checksum:
        messages.error(
            doc.pk,
            "Checksum mismatch. "
            f"Stored: {doc.checksum}, actual: {checksum}.",
        )


def _check_archive(
    doc: Document,
    messages: SanityCheckMessages,
    present_files: set,
) -> None:
    """Record an error if the archive data of ``doc`` is inconsistent or damaged."""
    if doc.archive_checksum is not None and doc.archive_filename is None:
        messages.error(
            doc.pk,
            "Document has an archive file checksum, but no archive filename.",
        )
        return
    if doc.archive_checksum is None and doc.archive_filename is not None:
        messages.error(
            doc.pk,
            "Document has an archive file, but its checksum is missing.",
        )
        return
    if not doc.has_archive_version:
        return

    archive_path: Final[Path] = Path(doc.archive_path).resolve()
    if not archive_path.is_file():
        messages.error(doc.pk, "Archived version of document does not exist.")
        return

    present_files.discard(archive_path)
    try:
        checksum = _md5_of_file(archive_path)
    except OSError as e:
        messages.error(
            doc.pk,
            f"Cannot read archive file of document : {e}",
        )
        return

    if checksum != doc.archive_checksum:
        messages.error(
            doc.pk,
            "Checksum mismatch of archived document. "
            f"Stored: {doc.archive_checksum}, "
            f"actual: {checksum}.",
        )


def check_sanity(progress=False) -> SanityCheckMessages:
    """
    Check every document's files on disk and report what is wrong.

    For each document, the thumbnail, the original and (if the document has
    one) the archive version are checked for existence and readability, and
    the MD5 checksums of original and archive file are compared against the
    stored values.  Documents without content get an info message.  Every file
    below ``settings.MEDIA_ROOT`` which is neither the media lock file nor
    claimed by a document is reported as an orphan with a warning.

    :param progress: if true, show a tqdm progress bar while iterating over
        the documents
    :return: the collected messages, keyed by document primary key (``None``
        for orphaned files)
    """
    messages = SanityCheckMessages()

    # All regular files in the media dir. Each file that turns out to belong
    # to a document is removed again; what is left over is orphaned.
    present_files = {
        x.resolve() for x in Path(settings.MEDIA_ROOT).glob("**/*") if not x.is_dir()
    }

    # The lock file is expected to live in the media dir and is no orphan
    present_files.discard(Path(settings.MEDIA_LOCK).resolve())

    for doc in tqdm(Document.objects.all(), disable=not progress):
        _check_thumbnail(doc, messages, present_files)
        _check_original(doc, messages, present_files)
        _check_archive(doc, messages, present_files)

        # other document checks
        if not doc.content:
            messages.info(doc.pk, "Document contains no OCR data")

    for extra_file in present_files:
        messages.warning(None, f"Orphaned file in media dir: {extra_file}")

    return messages
added a simple sanity checker. 2020-11-25 16:04:58 +01:00			`import hashlib`
better sanity checker that logs messages in the log files and does not fail on warnings. 2021-02-14 17:08:29 +01:00			`import logging`
Makes the sanity check messages better for users 2022-05-30 17:03:33 -07:00			`from collections import defaultdict`
Updates to use pathlib instead of os.path 2022-05-31 08:42:11 -07:00			`from pathlib import Path`
			`from typing import Final`
added a simple sanity checker. 2020-11-25 16:04:58 +01:00
			`from django.conf import settings`
Runs the pre-commit hooks over all the Python files 2022-03-11 10:55:51 -08:00			`from tqdm import tqdm`
added a simple sanity checker. 2020-11-25 16:04:58 +01:00
Fixes ruff not running isort against the codebase 2023-04-20 08:10:17 -07:00			`from documents.models import Document`

added a simple sanity checker. 2020-11-25 16:04:58 +01:00
better sanity checker that logs messages in the log files and does not fail on warnings. 2021-02-14 17:08:29 +01:00			`class SanityCheckMessages:`
			`def __init__(self):`
Makes the sanity check messages better for users 2022-05-30 17:03:33 -07:00			`self._messages = defaultdict(list)`
			`self.has_error = False`
			`self.has_warning = False`
added a simple sanity checker. 2020-11-25 16:04:58 +01:00
Makes the sanity check messages better for users 2022-05-30 17:03:33 -07:00			`def error(self, doc_pk, message):`
			`self._messages[doc_pk].append({"level": logging.ERROR, "message": message})`
			`self.has_error = True`
added a simple sanity checker. 2020-11-25 16:04:58 +01:00
Makes the sanity check messages better for users 2022-05-30 17:03:33 -07:00			`def warning(self, doc_pk, message):`
			`self._messages[doc_pk].append({"level": logging.WARNING, "message": message})`
			`self.has_warning = True`
added a simple sanity checker. 2020-11-25 16:04:58 +01:00
Makes the sanity check messages better for users 2022-05-30 17:03:33 -07:00			`def info(self, doc_pk, message):`
			`self._messages[doc_pk].append({"level": logging.INFO, "message": message})`
added a simple sanity checker. 2020-11-25 16:04:58 +01:00
better sanity checker that logs messages in the log files and does not fail on warnings. 2021-02-14 17:08:29 +01:00			`def log_messages(self):`
			`logger = logging.getLogger("paperless.sanity_checker")`
added a simple sanity checker. 2020-11-25 16:04:58 +01:00
better sanity checker that logs messages in the log files and does not fail on warnings. 2021-02-14 17:08:29 +01:00			`if len(self._messages) == 0:`
			`logger.info("Sanity checker detected no issues.")`
			`else:`
Makes the sanity check messages better for users 2022-05-30 17:03:33 -07:00			`# Query once`
			`all_docs = Document.objects.all()`

			`for doc_pk in self._messages:`
			`if doc_pk is not None:`
			`doc = all_docs.get(pk=doc_pk)`
Refines the sanity check header, fixes other test issues 2022-05-30 17:29:30 -07:00			`logger.info(`
			`f"Detected following issue(s) with document #{doc.pk},"`
			`f" titled {doc.title}",`
			`)`
Makes the sanity check messages better for users 2022-05-30 17:03:33 -07:00			`for msg in self._messages[doc_pk]:`
			`logger.log(msg["level"], msg["message"])`
better sanity checker that logs messages in the log files and does not fail on warnings. 2021-02-14 17:08:29 +01:00
			`def __len__(self):`
			`return len(self._messages)`

			`def __getitem__(self, item):`
			`return self._messages[item]`
added a simple sanity checker. 2020-11-25 16:04:58 +01:00

better sanity checker that logs messages in the log files and does not fail on warnings. 2021-02-14 17:08:29 +01:00			`class SanityCheckFailedException(Exception):`
			`pass`
added a simple sanity checker. 2020-11-25 16:04:58 +01:00

Makes the sanity check messages better for users 2022-05-30 17:03:33 -07:00			`def check_sanity(progress=False) -> SanityCheckMessages:`
better sanity checker that logs messages in the log files and does not fail on warnings. 2021-02-14 17:08:29 +01:00			`messages = SanityCheckMessages()`
added a simple sanity checker. 2020-11-25 16:04:58 +01:00
Updates to use pathlib instead of os.path 2022-05-31 08:42:11 -07:00			`present_files = {`
			`x.resolve() for x in Path(settings.MEDIA_ROOT).glob("**/*") if not x.is_dir()`
			`}`
added a simple sanity checker. 2020-11-25 16:04:58 +01:00
Updates to use pathlib instead of os.path 2022-05-31 08:42:11 -07:00			`lockfile = Path(settings.MEDIA_LOCK).resolve()`
excluded the lockfile from the sanity checker. 2020-12-10 00:29:47 +01:00			`if lockfile in present_files:`
			`present_files.remove(lockfile)`

Add --no-progress-bar option to commands 2021-04-18 15:56:00 +02:00			`for doc in tqdm(Document.objects.all(), disable=not progress):`
testing and fixing the sanity checker 2020-12-02 01:18:11 +01:00			`# Check sanity of the thumbnail`
Updates to use pathlib instead of os.path 2022-05-31 08:42:11 -07:00			`thumbnail_path: Final[Path] = Path(doc.thumbnail_path).resolve()`
			`if not thumbnail_path.exists() or not thumbnail_path.is_file():`
Makes the sanity check messages better for users 2022-05-30 17:03:33 -07:00			`messages.error(doc.pk, "Thumbnail of document does not exist.")`
added a simple sanity checker. 2020-11-25 16:04:58 +01:00			`else:`
Updates to use pathlib instead of os.path 2022-05-31 08:42:11 -07:00			`if thumbnail_path in present_files:`
			`present_files.remove(thumbnail_path)`
added a simple sanity checker. 2020-11-25 16:04:58 +01:00			`try:`
Updates to use pathlib instead of os.path 2022-05-31 08:42:11 -07:00			`_ = thumbnail_path.read_bytes()`
added a simple sanity checker. 2020-11-25 16:04:58 +01:00			`except OSError as e:`
Makes the sanity check messages better for users 2022-05-30 17:03:33 -07:00			`messages.error(doc.pk, f"Cannot read thumbnail file of document: {e}")`
added a simple sanity checker. 2020-11-25 16:04:58 +01:00
testing and fixing the sanity checker 2020-12-02 01:18:11 +01:00			`# Check sanity of the original file`
			`# TODO: extract method`
Updates to use pathlib instead of os.path 2022-05-31 08:42:11 -07:00			`source_path: Final[Path] = Path(doc.source_path).resolve()`
			`if not source_path.exists() or not source_path.is_file():`
Makes the sanity check messages better for users 2022-05-30 17:03:33 -07:00			`messages.error(doc.pk, "Original of document does not exist.")`
added a simple sanity checker. 2020-11-25 16:04:58 +01:00			`else:`
Updates to use pathlib instead of os.path 2022-05-31 08:42:11 -07:00			`if source_path in present_files:`
			`present_files.remove(source_path)`
added a simple sanity checker. 2020-11-25 16:04:58 +01:00			`try:`
Updates to use pathlib instead of os.path 2022-05-31 08:42:11 -07:00			`checksum = hashlib.md5(source_path.read_bytes()).hexdigest()`
added a simple sanity checker. 2020-11-25 16:04:58 +01:00			`except OSError as e:`
Makes the sanity check messages better for users 2022-05-30 17:03:33 -07:00			`messages.error(doc.pk, f"Cannot read original file of document: {e}")`
added checksums for archived documents. 2020-11-29 12:31:26 +01:00			`else:`
Configures ruff as the one stop linter and resolves warnings it raised 2023-03-28 09:39:30 -07:00			`if checksum != doc.checksum:`
better sanity checker that logs messages in the log files and does not fail on warnings. 2021-02-14 17:08:29 +01:00			`messages.error(`
Makes the sanity check messages better for users 2022-05-30 17:03:33 -07:00			`doc.pk,`
			`"Checksum mismatch. "`
Runs the pre-commit hooks over all the Python files 2022-03-11 10:55:51 -08:00			`f"Stored: {doc.checksum}, actual: {checksum}.",`
better sanity checker that logs messages in the log files and does not fail on warnings. 2021-02-14 17:08:29 +01:00			`)`
added a simple sanity checker. 2020-11-25 16:04:58 +01:00
sanity checker testing 2021-02-10 00:52:18 +01:00			`# Check sanity of the archive file.`
Updates to use pathlib instead of os.path 2022-05-31 08:42:11 -07:00			`if doc.archive_checksum is not None and doc.archive_filename is None:`
better sanity checker that logs messages in the log files and does not fail on warnings. 2021-02-14 17:08:29 +01:00			`messages.error(`
Makes the sanity check messages better for users 2022-05-30 17:03:33 -07:00			`doc.pk,`
			`"Document has an archive file checksum, but no archive filename.",`
better sanity checker that logs messages in the log files and does not fail on warnings. 2021-02-14 17:08:29 +01:00			`)`
Updates to use pathlib instead of os.path 2022-05-31 08:42:11 -07:00			`elif doc.archive_checksum is None and doc.archive_filename is not None:`
better sanity checker that logs messages in the log files and does not fail on warnings. 2021-02-14 17:08:29 +01:00			`messages.error(`
Makes the sanity check messages better for users 2022-05-30 17:03:33 -07:00			`doc.pk,`
			`"Document has an archive file, but its checksum is missing.",`
better sanity checker that logs messages in the log files and does not fail on warnings. 2021-02-14 17:08:29 +01:00			`)`
sanity checker testing 2021-02-10 00:52:18 +01:00			`elif doc.has_archive_version:`
Updates to use pathlib instead of os.path 2022-05-31 08:42:11 -07:00			`archive_path: Final[Path] = Path(doc.archive_path).resolve()`
			`if not archive_path.exists() or not archive_path.is_file():`
Makes the sanity check messages better for users 2022-05-30 17:03:33 -07:00			`messages.error(doc.pk, "Archived version of document does not exist.")`
added checksums for archived documents. 2020-11-29 12:31:26 +01:00			`else:`
Updates to use pathlib instead of os.path 2022-05-31 08:42:11 -07:00			`if archive_path in present_files:`
			`present_files.remove(archive_path)`
testing and fixing the sanity checker 2020-12-02 01:18:11 +01:00			`try:`
Updates to use pathlib instead of os.path 2022-05-31 08:42:11 -07:00			`checksum = hashlib.md5(archive_path.read_bytes()).hexdigest()`
testing and fixing the sanity checker 2020-12-02 01:18:11 +01:00			`except OSError as e:`
better sanity checker that logs messages in the log files and does not fail on warnings. 2021-02-14 17:08:29 +01:00			`messages.error(`
Makes the sanity check messages better for users 2022-05-30 17:03:33 -07:00			`doc.pk,`
			`f"Cannot read archive file of document : {e}",`
better sanity checker that logs messages in the log files and does not fail on warnings. 2021-02-14 17:08:29 +01:00			`)`
testing and fixing the sanity checker 2020-12-02 01:18:11 +01:00			`else:`
Configures ruff as the one stop linter and resolves warnings it raised 2023-03-28 09:39:30 -07:00			`if checksum != doc.archive_checksum:`
better sanity checker that logs messages in the log files and does not fail on warnings. 2021-02-14 17:08:29 +01:00			`messages.error(`
Makes the sanity check messages better for users 2022-05-30 17:03:33 -07:00			`doc.pk,`
			`"Checksum mismatch of archived document. "`
better sanity checker that logs messages in the log files and does not fail on warnings. 2021-02-14 17:08:29 +01:00			`f"Stored: {doc.archive_checksum}, "`
Runs the pre-commit hooks over all the Python files 2022-03-11 10:55:51 -08:00			`f"actual: {checksum}.",`
better sanity checker that logs messages in the log files and does not fail on warnings. 2021-02-14 17:08:29 +01:00			`)`
testing and fixing the sanity checker 2020-12-02 01:18:11 +01:00
			`# other document checks`
added a simple sanity checker. 2020-11-25 16:04:58 +01:00			`if not doc.content:`
Fixes formatting 2022-06-01 08:08:03 -07:00			`messages.info(doc.pk, "Document contains no OCR data")`
added a simple sanity checker. 2020-11-25 16:04:58 +01:00
			`for extra_file in present_files:`
Makes the sanity check messages better for users 2022-05-30 17:03:33 -07:00			`messages.warning(None, f"Orphaned file in media dir: {extra_file}")`
added a simple sanity checker. 2020-11-25 16:04:58 +01:00
			`return messages`