paperless-ngx/src/documents/tests/test_consumer.py

1334 lines
45 KiB
Python
Raw Normal View History

import datetime
2020-11-16 23:16:37 +01:00
import os
import re
2020-11-29 19:22:49 +01:00
import shutil
import stat
2020-11-16 23:16:37 +01:00
import tempfile
import zoneinfo
from pathlib import Path
from unittest import TestCase as UnittestTestCase
2020-11-16 23:16:37 +01:00
from unittest import mock
from unittest.mock import MagicMock
from dateutil import tz
2021-02-07 18:23:54 +01:00
from django.conf import settings
from django.contrib.auth.models import Group
from django.contrib.auth.models import User
from django.test import TestCase
from django.test import override_settings
from django.utils import timezone
from guardian.core import ObjectPermissionChecker
from documents.consumer import ConsumerError
from documents.data_models import DocumentMetadataOverrides
from documents.models import Correspondent
from documents.models import CustomField
from documents.models import Document
from documents.models import DocumentType
from documents.models import FileInfo
from documents.models import StoragePath
from documents.models import Tag
from documents.parsers import DocumentParser
from documents.parsers import ParseError
from documents.plugins.helpers import ProgressStatusOptions
from documents.tasks import sanity_check
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
from documents.tests.utils import GetConsumerMixin
2016-02-11 12:25:23 +00:00
class TestAttributes(UnittestTestCase):
    """Checks the attributes ``FileInfo.from_filename`` derives from a file name."""

    TAGS = ("tag1", "tag2", "tag3")

    def _test_guess_attributes_from_name(self, filename, sender, title, tags):
        """
        Parse ``filename`` and compare the result with the expected values.

        A falsy ``sender`` means no correspondent is expected at all. The
        file name is passed as the assertion message so that a failure
        identifies the input that caused it.
        """
        parsed = FileInfo.from_filename(filename)

        if not sender:
            self.assertIsNone(parsed.correspondent, filename)
        else:
            self.assertEqual(parsed.correspondent.name, sender, filename)

        self.assertEqual(parsed.title, title, filename)

        tag_names = tuple(tag.name for tag in parsed.tags)
        self.assertEqual(tag_names, tags, filename)

    def test_guess_attributes_from_name_when_title_starts_with_dash(self):
        """A leading dash stays part of the title and yields no correspondent."""
        self._test_guess_attributes_from_name(
            filename="- weird but should not break.pdf",
            sender=None,
            title="- weird but should not break",
            tags=(),
        )

    def test_guess_attributes_from_name_when_title_ends_with_dash(self):
        """A trailing dash stays part of the title and yields no correspondent."""
        self._test_guess_attributes_from_name(
            filename="weird but should not break -.pdf",
            sender=None,
            title="weird but should not break -",
            tags=(),
        )
2016-10-26 09:32:59 +00:00
class TestFieldPermutations(TestCase):
    """Exercises ``FileInfo.from_filename`` over combinations of file name fields."""

    valid_dates = (
        "20150102030405Z",
        "20150102Z",
    )
    valid_correspondents = ["timmy", "Dr. McWheelie", "Dash Gor-don", "o Θεpμaoτής", ""]
    valid_titles = ["title", "Title w Spaces", "Title a-dash", "Tίτλoς", ""]
    valid_tags = ["tag", "tig,tag", "tag1,tag2,tag-3"]

    def _test_guessed_attributes(
        self,
        filename,
        created=None,
        correspondent=None,
        title=None,
        tags=None,
    ):
        """
        Parse ``filename`` and verify every guessed attribute.

        ``created`` is the date string embedded in the file name; only its
        year, month and day digits are compared. ``tags`` is the
        comma-separated tag string, or ``None`` when no tags are expected.
        The file name is used as the message of every assertion.
        """
        info = FileInfo.from_filename(filename)

        # Created
        if created is None:
            self.assertIsNone(info.created, filename)
        else:
            self.assertEqual(info.created.year, int(created[:4]), filename)
            self.assertEqual(info.created.month, int(created[4:6]), filename)
            self.assertEqual(info.created.day, int(created[6:8]), filename)

        # Correspondent
        if correspondent:
            self.assertEqual(info.correspondent.name, correspondent, filename)
        else:
            self.assertIsNone(info.correspondent, filename)

        # Title
        self.assertEqual(info.title, title, filename)

        # Tags
        if tags is None:
            self.assertEqual(info.tags, (), filename)
        else:
            self.assertEqual([t.name for t in info.tags], tags.split(","), filename)

    def test_just_title(self):
        """A file name holding only a title yields just that title."""
        template = "{title}.pdf"
        for title in self.valid_titles:
            spec = {"title": title}
            filename = template.format(**spec)
            self._test_guessed_attributes(filename, **spec)

    def test_created_and_title(self):
        """A ``<date> - <title>`` file name yields both the date and the title."""
        template = "{created} - {title}.pdf"

        for created in self.valid_dates:
            for title in self.valid_titles:
                spec = {"created": created, "title": title}
                self._test_guessed_attributes(template.format(**spec), **spec)

    def test_invalid_date_format(self):
        """A date prefix in an unsupported format is dropped without setting ``created``."""
        info = FileInfo.from_filename("06112017Z - title.pdf")
        self.assertEqual(info.title, "title")
        self.assertIsNone(info.created)

    def test_filename_parse_transforms(self):
        """``FILENAME_PARSE_TRANSFORMS`` rewrites the file name using the first matching pattern."""
        filename = "tag1,tag2_20190908_180610_0001.pdf"
        all_patt = re.compile("^.*$")
        none_patt = re.compile("$a")

        # No transformations configured (= default)
        info = FileInfo.from_filename(filename)
        self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001")
        self.assertEqual(info.tags, ())
        self.assertIsNone(info.created)

        # Pattern doesn't match (filename unaltered)
        with self.settings(FILENAME_PARSE_TRANSFORMS=[(none_patt, "none.gif")]):
            info = FileInfo.from_filename(filename)
            self.assertEqual(info.title, "tag1,tag2_20190908_180610_0001")

        # Simple transformation (match all)
        with self.settings(FILENAME_PARSE_TRANSFORMS=[(all_patt, "all.gif")]):
            info = FileInfo.from_filename(filename)
            self.assertEqual(info.title, "all")

        # Multiple transformations configured (first pattern matches)
        with self.settings(
            FILENAME_PARSE_TRANSFORMS=[
                (all_patt, "all.gif"),
                (all_patt, "anotherall.gif"),
            ],
        ):
            info = FileInfo.from_filename(filename)
            self.assertEqual(info.title, "all")

        # Multiple transformations configured (second pattern matches)
        with self.settings(
            FILENAME_PARSE_TRANSFORMS=[
                (none_patt, "none.gif"),
                (all_patt, "anotherall.gif"),
            ],
        ):
            info = FileInfo.from_filename(filename)
            self.assertEqual(info.title, "anotherall")
2020-11-16 23:16:37 +01:00
Feature: Allow setting backend configuration settings via the UI (#5126) * Saving some start on this * At least partially working for the tesseract parser * Problems with migration testing need to figure out * Work around that error * Fixes max m_pixels * Moving the settings to main paperless application * Starting some consumer options * More fixes and work * Fixes these last tests * Fix max_length on OcrSettings.mode field * Fix all fields on Common & Ocr settings serializers * Umbrellla config view * Revert "Umbrellla config view" This reverts commit fbaf9f4be30f89afeb509099180158a3406416a5. * Updates to use a single configuration object for all settings * Squashed commit of the following: commit 8a0a49dd5766094f60462fbfbe62e9921fbd2373 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 23:02:47 2023 -0800 Fix formatting commit 66b2d90c507b8afd9507813ff555e46198ea33b9 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 22:36:35 2023 -0800 Refactor frontend data models commit 5723bd8dd823ee855625e250df39393e26709d48 Author: Adam Bogdał <adam@bogdal.pl> Date: Wed Dec 20 01:17:43 2023 +0100 Fix: speed up admin panel for installs with a large number of documents (#5052) commit 9b08ce176199bf9011a6634bb88f616846150d2b Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:18:51 2023 -0800 Update PULL_REQUEST_TEMPLATE.md commit a6248bec2d793b7690feed95fcaf5eb34a75bfb6 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:02:05 2023 -0800 Chore: Update Angular to v17 (#4980) commit b1f6f52486d5ba5c04af99b41315eb6428fd1fa8 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:53:56 2023 -0800 Fix: Dont allow null custom_fields property via API (#5063) commit 638d9970fd468d8c02c91d19bd28f8b0796bdcb1 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:43:50 2023 -0800 Enhancement: symmetric document links (#4907) commit 
5e8de4c1da6eb4eb8f738b20962595c7536b30ec Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 12:45:04 2023 -0800 Enhancement: shared icon & shared by me filter (#4859) commit 088bad90306025d3f6b139cbd0ad264a1cbecfe5 Author: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Tue Dec 19 12:04:03 2023 -0800 Bulk updates all the backend libraries (#5061) * Saving some work on frontend config * Very basic but dynamically-generated config form * Saving work on slightly less ugly frontend config * JSON validation for user_args field * Fully dynamic config form * Adds in some additional validators for a nicer error message * Cleaning up the testing and coverage more * Reverts unintentional change * Adds documentation about the settings and the precedence * Couple more commenting and style fixes --------- Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
2023-12-29 15:42:56 -08:00
class _BaseTestParser(DocumentParser):
    """Shared base class for the stub parsers defined in this test module."""

    def get_settings(self):
        """
        Return the additional settings of this parser.

        The test parsers do not implement any, so the result is ``None``.
        """
        return None
class DummyParser(_BaseTestParser):
    """Stub parser that always succeeds with fixed text and a caller-supplied archive file."""

    def __init__(self, logging_group, scratch_dir, archive_path):
        """
        Create the parser and an empty placeholder thumbnail.

        ``scratch_dir`` is the directory the placeholder ``.webp`` file is
        created in; ``archive_path`` is stored unchanged on the instance.
        """
        super().__init__(logging_group, None)
        # mkstemp() returns an open OS-level descriptor together with the
        # path. Only the path is needed, so close the descriptor straight
        # away instead of leaking one per parser instance.
        thumb_fd, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
        os.close(thumb_fd)
        self.archive_path = archive_path

    def get_thumbnail(self, document_path, mime_type, file_name=None):
        """Return the placeholder thumbnail path, ignoring all arguments."""
        return self.fake_thumb

    def parse(self, document_path, mime_type, file_name=None):
        """Set the extracted text to a fixed string without reading the document."""
        self.text = "The Text"
2020-11-16 23:16:37 +01:00
Feature: Allow setting backend configuration settings via the UI (#5126) * Saving some start on this * At least partially working for the tesseract parser * Problems with migration testing need to figure out * Work around that error * Fixes max m_pixels * Moving the settings to main paperless application * Starting some consumer options * More fixes and work * Fixes these last tests * Fix max_length on OcrSettings.mode field * Fix all fields on Common & Ocr settings serializers * Umbrellla config view * Revert "Umbrellla config view" This reverts commit fbaf9f4be30f89afeb509099180158a3406416a5. * Updates to use a single configuration object for all settings * Squashed commit of the following: commit 8a0a49dd5766094f60462fbfbe62e9921fbd2373 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 23:02:47 2023 -0800 Fix formatting commit 66b2d90c507b8afd9507813ff555e46198ea33b9 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 22:36:35 2023 -0800 Refactor frontend data models commit 5723bd8dd823ee855625e250df39393e26709d48 Author: Adam Bogdał <adam@bogdal.pl> Date: Wed Dec 20 01:17:43 2023 +0100 Fix: speed up admin panel for installs with a large number of documents (#5052) commit 9b08ce176199bf9011a6634bb88f616846150d2b Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:18:51 2023 -0800 Update PULL_REQUEST_TEMPLATE.md commit a6248bec2d793b7690feed95fcaf5eb34a75bfb6 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:02:05 2023 -0800 Chore: Update Angular to v17 (#4980) commit b1f6f52486d5ba5c04af99b41315eb6428fd1fa8 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:53:56 2023 -0800 Fix: Dont allow null custom_fields property via API (#5063) commit 638d9970fd468d8c02c91d19bd28f8b0796bdcb1 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:43:50 2023 -0800 Enhancement: symmetric document links (#4907) commit 
5e8de4c1da6eb4eb8f738b20962595c7536b30ec Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 12:45:04 2023 -0800 Enhancement: shared icon & shared by me filter (#4859) commit 088bad90306025d3f6b139cbd0ad264a1cbecfe5 Author: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Tue Dec 19 12:04:03 2023 -0800 Bulk updates all the backend libraries (#5061) * Saving some work on frontend config * Very basic but dynamically-generated config form * Saving work on slightly less ugly frontend config * JSON validation for user_args field * Fully dynamic config form * Adds in some additional validators for a nicer error message * Cleaning up the testing and coverage more * Reverts unintentional change * Adds documentation about the settings and the precedence * Couple more commenting and style fixes --------- Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
2023-12-29 15:42:56 -08:00
class CopyParser(_BaseTestParser):
    """Stub parser that uses a copy of the input document as its archive file."""

    def get_thumbnail(self, document_path, mime_type, file_name=None):
        """Return the placeholder thumbnail path, ignoring all arguments."""
        return self.fake_thumb

    def __init__(self, logging_group, progress_callback=None):
        """Create the parser and an empty placeholder thumbnail in ``self.tempdir``."""
        super().__init__(logging_group, progress_callback)
        # NOTE(review): self.tempdir is not set in this class; presumably the
        # base parser's __init__ provides it - confirm.
        # mkstemp() returns an open OS-level descriptor together with the
        # path. Only the path is needed, so close the descriptor straight
        # away instead of leaking one per parser instance.
        thumb_fd, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=self.tempdir)
        os.close(thumb_fd)

    def parse(self, document_path, mime_type, file_name=None):
        """Set a fixed text and copy the document to ``archive.pdf`` inside ``self.tempdir``."""
        self.text = "The text"
        self.archive_path = os.path.join(self.tempdir, "archive.pdf")
        shutil.copy(document_path, self.archive_path)
Feature: Allow setting backend configuration settings via the UI (#5126) * Saving some start on this * At least partially working for the tesseract parser * Problems with migration testing need to figure out * Work around that error * Fixes max m_pixels * Moving the settings to main paperless application * Starting some consumer options * More fixes and work * Fixes these last tests * Fix max_length on OcrSettings.mode field * Fix all fields on Common & Ocr settings serializers * Umbrellla config view * Revert "Umbrellla config view" This reverts commit fbaf9f4be30f89afeb509099180158a3406416a5. * Updates to use a single configuration object for all settings * Squashed commit of the following: commit 8a0a49dd5766094f60462fbfbe62e9921fbd2373 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 23:02:47 2023 -0800 Fix formatting commit 66b2d90c507b8afd9507813ff555e46198ea33b9 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 22:36:35 2023 -0800 Refactor frontend data models commit 5723bd8dd823ee855625e250df39393e26709d48 Author: Adam Bogdał <adam@bogdal.pl> Date: Wed Dec 20 01:17:43 2023 +0100 Fix: speed up admin panel for installs with a large number of documents (#5052) commit 9b08ce176199bf9011a6634bb88f616846150d2b Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:18:51 2023 -0800 Update PULL_REQUEST_TEMPLATE.md commit a6248bec2d793b7690feed95fcaf5eb34a75bfb6 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:02:05 2023 -0800 Chore: Update Angular to v17 (#4980) commit b1f6f52486d5ba5c04af99b41315eb6428fd1fa8 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:53:56 2023 -0800 Fix: Dont allow null custom_fields property via API (#5063) commit 638d9970fd468d8c02c91d19bd28f8b0796bdcb1 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:43:50 2023 -0800 Enhancement: symmetric document links (#4907) commit 
5e8de4c1da6eb4eb8f738b20962595c7536b30ec Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 12:45:04 2023 -0800 Enhancement: shared icon & shared by me filter (#4859) commit 088bad90306025d3f6b139cbd0ad264a1cbecfe5 Author: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Tue Dec 19 12:04:03 2023 -0800 Bulk updates all the backend libraries (#5061) * Saving some work on frontend config * Very basic but dynamically-generated config form * Saving work on slightly less ugly frontend config * JSON validation for user_args field * Fully dynamic config form * Adds in some additional validators for a nicer error message * Cleaning up the testing and coverage more * Reverts unintentional change * Adds documentation about the settings and the precedence * Couple more commenting and style fixes --------- Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
2023-12-29 15:42:56 -08:00
class FaultyParser(_BaseTestParser):
    """Stub parser whose ``parse`` always fails with a ``ParseError``."""

    def __init__(self, logging_group, scratch_dir):
        """Create the parser and an empty placeholder thumbnail in ``scratch_dir``."""
        super().__init__(logging_group)
        # mkstemp() returns an open OS-level descriptor together with the
        # path. Only the path is needed, so close the descriptor straight
        # away instead of leaking one per parser instance.
        thumb_fd, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
        os.close(thumb_fd)

    def get_thumbnail(self, document_path, mime_type, file_name=None):
        """Return the placeholder thumbnail path, ignoring all arguments."""
        return self.fake_thumb

    def parse(self, document_path, mime_type, file_name=None):
        """Always raise ``ParseError``."""
        raise ParseError("Does not compute.")
Feature: Allow setting backend configuration settings via the UI (#5126) * Saving some start on this * At least partially working for the tesseract parser * Problems with migration testing need to figure out * Work around that error * Fixes max m_pixels * Moving the settings to main paperless application * Starting some consumer options * More fixes and work * Fixes these last tests * Fix max_length on OcrSettings.mode field * Fix all fields on Common & Ocr settings serializers * Umbrellla config view * Revert "Umbrellla config view" This reverts commit fbaf9f4be30f89afeb509099180158a3406416a5. * Updates to use a single configuration object for all settings * Squashed commit of the following: commit 8a0a49dd5766094f60462fbfbe62e9921fbd2373 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 23:02:47 2023 -0800 Fix formatting commit 66b2d90c507b8afd9507813ff555e46198ea33b9 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 22:36:35 2023 -0800 Refactor frontend data models commit 5723bd8dd823ee855625e250df39393e26709d48 Author: Adam Bogdał <adam@bogdal.pl> Date: Wed Dec 20 01:17:43 2023 +0100 Fix: speed up admin panel for installs with a large number of documents (#5052) commit 9b08ce176199bf9011a6634bb88f616846150d2b Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:18:51 2023 -0800 Update PULL_REQUEST_TEMPLATE.md commit a6248bec2d793b7690feed95fcaf5eb34a75bfb6 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:02:05 2023 -0800 Chore: Update Angular to v17 (#4980) commit b1f6f52486d5ba5c04af99b41315eb6428fd1fa8 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:53:56 2023 -0800 Fix: Dont allow null custom_fields property via API (#5063) commit 638d9970fd468d8c02c91d19bd28f8b0796bdcb1 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:43:50 2023 -0800 Enhancement: symmetric document links (#4907) commit 
5e8de4c1da6eb4eb8f738b20962595c7536b30ec Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 12:45:04 2023 -0800 Enhancement: shared icon & shared by me filter (#4859) commit 088bad90306025d3f6b139cbd0ad264a1cbecfe5 Author: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Tue Dec 19 12:04:03 2023 -0800 Bulk updates all the backend libraries (#5061) * Saving some work on frontend config * Very basic but dynamically-generated config form * Saving work on slightly less ugly frontend config * JSON validation for user_args field * Fully dynamic config form * Adds in some additional validators for a nicer error message * Cleaning up the testing and coverage more * Reverts unintentional change * Adds documentation about the settings and the precedence * Couple more commenting and style fixes --------- Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
2023-12-29 15:42:56 -08:00
class FaultyGenericExceptionParser(_BaseTestParser):
    """Stub parser whose ``parse`` always fails with a plain ``Exception``."""

    def __init__(self, logging_group, scratch_dir):
        """Create the parser and an empty placeholder thumbnail in ``scratch_dir``."""
        super().__init__(logging_group)
        # mkstemp() returns an open OS-level descriptor together with the
        # path. Only the path is needed, so close the descriptor straight
        # away instead of leaking one per parser instance.
        thumb_fd, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
        os.close(thumb_fd)

    def get_thumbnail(self, document_path, mime_type, file_name=None):
        """Return the placeholder thumbnail path, ignoring all arguments."""
        return self.fake_thumb

    def parse(self, document_path, mime_type, file_name=None):
        """Always raise a generic ``Exception`` (deliberately not a ``ParseError``)."""
        raise Exception("Generic exception.")
2020-11-20 13:31:03 +01:00
def fake_magic_from_file(file, mime=False):
    """
    Stand-in for ``magic.from_file`` that decides by file name alone.

    ``file`` may be a ``str`` or any ``os.PathLike`` path; its contents are
    never read. With ``mime=True`` the result is a MIME type string:
    ``application/octet-stream`` when the file name starts with
    ``invalid_pdf``, otherwise a type chosen by extension (``.pdf``,
    ``.png``, ``.webp``) or ``"unknown"``. With ``mime=False`` a fixed
    descriptive string is returned.
    """
    if not mime:
        return "A verbose string that describes the contents of the file"

    # Normalise first so that plain string paths work as well as Path objects.
    path = Path(file)
    if path.name.startswith("invalid_pdf"):
        return "application/octet-stream"

    mime_by_suffix = {
        ".pdf": "application/pdf",
        ".png": "image/png",
        ".webp": "image/webp",
    }
    return mime_by_suffix.get(os.path.splitext(path)[1], "unknown")
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
class TestConsumer(
DirectoriesMixin,
FileSystemAssertsMixin,
GetConsumerMixin,
TestCase,
):
2022-02-27 15:26:41 +01:00
def _assert_first_last_send_progress(
    self,
    first_status=ProgressStatusOptions.STARTED,
    last_status=ProgressStatusOptions.SUCCESS,
    first_progress=0,
    first_progress_max=100,
    last_progress=100,
    last_progress_max=100,
):
    """
    Verify the first and the last progress payload recorded on ``self.status``.

    At least two payloads must have been recorded. The ``first_*``
    arguments describe the expected first payload, the ``last_*``
    arguments the expected final one.
    """
    payloads = self.status.payloads
    self.assertGreaterEqual(len(payloads), 2)

    expectations = (
        (payloads[0], first_progress, first_progress_max, first_status),
        (payloads[-1], last_progress, last_progress_max, last_status),
    )
    for payload, current, maximum, status in expectations:
        self.assertEqual(payload["data"]["current_progress"], current)
        self.assertEqual(payload["data"]["max_progress"], maximum)
        self.assertEqual(payload["data"]["status"], status)
2021-01-28 19:28:48 +01:00
2021-01-26 15:19:56 +01:00
def make_dummy_parser(self, logging_group, progress_callback=None):
    """
    Build a ``DummyParser`` working in the scratch directory.

    ``progress_callback`` is accepted but not forwarded to the parser.
    """
    scratch_dir = self.dirs.scratch_dir
    archive_file = self.get_test_archive_file()
    return DummyParser(logging_group, scratch_dir, archive_file)
2020-11-16 23:16:37 +01:00
2021-01-26 15:19:56 +01:00
def make_faulty_parser(self, logging_group, progress_callback=None):
    """
    Build a ``FaultyParser`` working in the scratch directory.

    ``progress_callback`` is accepted but not forwarded to the parser.
    """
    scratch_dir = self.dirs.scratch_dir
    return FaultyParser(logging_group, scratch_dir)
2020-11-16 23:16:37 +01:00
def make_faulty_generic_exception_parser(
    self,
    logging_group,
    progress_callback=None,
):
    """
    Build a ``FaultyGenericExceptionParser`` working in the scratch directory.

    ``progress_callback`` is accepted but not forwarded to the parser.
    """
    scratch_dir = self.dirs.scratch_dir
    return FaultyGenericExceptionParser(logging_group, scratch_dir)
2020-11-16 23:16:37 +01:00
def setUp(self):
    """Patch the parser declaration signal so that PDFs resolve to the dummy parser."""
    super().setUp()

    dummy_declaration = {
        "parser": self.make_dummy_parser,
        "mime_types": {"application/pdf": ".pdf"},
        "weight": 0,
    }

    declaration_patch = mock.patch(
        "documents.parsers.document_consumer_declaration.send",
    )
    send_mock = declaration_patch.start()
    # Undo the patch when the test finishes, whatever its outcome.
    self.addCleanup(declaration_patch.stop)
    send_mock.return_value = [(None, dummy_declaration)]
2020-11-16 23:16:37 +01:00
def get_test_file(self):
    """Copy sample original ``0000001.pdf`` to ``sample.pdf`` in the scratch directory and return that path."""
    originals_dir = Path(__file__).parent / "samples" / "documents" / "originals"
    scratch_copy = self.dirs.scratch_dir / "sample.pdf"
    shutil.copy(originals_dir / "0000001.pdf", scratch_copy)
    return scratch_copy
def get_test_file2(self):
    """Copy sample original ``0000002.pdf`` to ``sample2.pdf`` in the scratch directory and return that path."""
    originals_dir = Path(__file__).parent / "samples" / "documents" / "originals"
    scratch_copy = self.dirs.scratch_dir / "sample2.pdf"
    shutil.copy(originals_dir / "0000002.pdf", scratch_copy)
    return scratch_copy
2020-11-29 19:22:49 +01:00
def get_test_archive_file(self):
    """Copy sample archive ``0000001.pdf`` to ``sample_archive.pdf`` in the scratch directory and return that path."""
    archive_dir = Path(__file__).parent / "samples" / "documents" / "archive"
    scratch_copy = self.dirs.scratch_dir / "sample_archive.pdf"
    shutil.copy(archive_dir / "0000001.pdf", scratch_copy)
    return scratch_copy
2020-11-16 23:16:37 +01:00
Feature: Dynamic document storage pathes (#916) * Added devcontainer * Add feature storage pathes * Exclude tests and add versioning * Check escaping * Check escaping * Check quoting * Echo * Escape * Escape : * Double escape \ * Escaping * Remove if * Escape colon * Missing \ * Esacpe : * Escape all * test * Remove sed * Fix exclude * Remove SED command * Add LD_LIBRARY_PATH * Adjusted to v1.7 * Updated test-cases * Remove devcontainer * Removed internal build-file * Run pre-commit * Corrected flak8 error * Adjusted to v1.7 * Updated test-cases * Corrected flak8 error * Adjusted to new plural translations * Small adjustments due to code-review backend * Adjusted line-break * Removed PAPERLESS prefix from settings variables * Corrected style change due to search+replace * First documentation draft * Revert changes to Pipfile * Add sphinx-autobuild with keep-outdated * Revert merge error that results in wrong storage path is evaluated * Adjust styles of generated files ... * Adds additional testing to cover dynamic storage path functionality * Remove unnecessary condition * Add hint to edit storage path dialog * Correct spelling of pathes to paths * Minor documentation tweaks * Minor typo * improving wrapping of filter editor buttons with new storage path button * Update .gitignore * Fix select border radius in non input-groups * Better storage path edit hint * Add note to edit storage path dialog re document_renamer * Add note to bulk edit storage path re document_renamer * Rename FILTER_STORAGE_DIRECTORY to PATH * Fix broken filter rule parsing * Show default storage if unspecified * Remove note re storage path on bulk edit * Add basic validation of filename variables Co-authored-by: Markus Kling <markus@markus-kling.net> Co-authored-by: Trenton Holmes <holmes.trenton@gmail.com> Co-authored-by: Michael Shamoon <4887959+shamoon@users.noreply.github.com> Co-authored-by: Quinn Casey <quinn@quinncasey.com>
2022-05-19 23:42:25 +02:00
@override_settings(FILENAME_FORMAT=None, TIME_ZONE="America/Chicago")
def testNormalOperation(self):
    """Consume a sample PDF and check the stored document, its files and its creation date."""
    filename = self.get_test_file()

    # Get the local time, as an aware datetime
    # Roughly equal to file modification time
    rough_create_date_local = timezone.localtime(timezone.now())

    with self.get_consumer(filename) as consumer:
        consumer.run()

        document = Document.objects.first()
        self.assertIsNotNone(document)

        self.assertEqual(document.content, "The Text")
        expected_title, _ = os.path.splitext(os.path.basename(filename))
        self.assertEqual(document.title, expected_title)
        self.assertIsNone(document.correspondent)
        self.assertIsNone(document.document_type)
        self.assertEqual(document.filename, "0000001.pdf")
        self.assertEqual(document.archive_filename, "0000001.pdf")

        self.assertIsFile(document.source_path)
        self.assertIsFile(document.thumbnail_path)
        self.assertIsFile(document.archive_path)

        self.assertEqual(document.checksum, "42995833e01aea9b3edee44bbfdd7ce1")
        self.assertEqual(
            document.archive_checksum,
            "62acb0bcbfbcaa62ca6ad3668e4e404b",
        )

        # The consumed input file must be gone afterwards
        self.assertIsNotFile(filename)

    self._assert_first_last_send_progress()

    # Convert UTC time from DB to local time
    document_date_local = timezone.localtime(document.created)

    self.assertEqual(
        document_date_local.tzinfo,
        zoneinfo.ZoneInfo("America/Chicago"),
    )
    self.assertEqual(document_date_local.tzinfo, rough_create_date_local.tzinfo)

    # Compare down to the minute only; skipping seconds and more precise
    for part in ("year", "month", "day", "hour", "minute"):
        self.assertEqual(
            getattr(document_date_local, part),
            getattr(rough_create_date_local, part),
        )
2022-03-10 13:27:40 +01:00
Feature: Dynamic document storage pathes (#916) * Added devcontainer * Add feature storage pathes * Exclude tests and add versioning * Check escaping * Check escaping * Check quoting * Echo * Escape * Escape : * Double escape \ * Escaping * Remove if * Escape colon * Missing \ * Esacpe : * Escape all * test * Remove sed * Fix exclude * Remove SED command * Add LD_LIBRARY_PATH * Adjusted to v1.7 * Updated test-cases * Remove devcontainer * Removed internal build-file * Run pre-commit * Corrected flak8 error * Adjusted to v1.7 * Updated test-cases * Corrected flak8 error * Adjusted to new plural translations * Small adjustments due to code-review backend * Adjusted line-break * Removed PAPERLESS prefix from settings variables * Corrected style change due to search+replace * First documentation draft * Revert changes to Pipfile * Add sphinx-autobuild with keep-outdated * Revert merge error that results in wrong storage path is evaluated * Adjust styles of generated files ... * Adds additional testing to cover dynamic storage path functionality * Remove unnecessary condition * Add hint to edit storage path dialog * Correct spelling of pathes to paths * Minor documentation tweaks * Minor typo * improving wrapping of filter editor buttons with new storage path button * Update .gitignore * Fix select border radius in non input-groups * Better storage path edit hint * Add note to edit storage path dialog re document_renamer * Add note to bulk edit storage path re document_renamer * Rename FILTER_STORAGE_DIRECTORY to PATH * Fix broken filter rule parsing * Show default storage if unspecified * Remove note re storage path on bulk edit * Add basic validation of filename variables Co-authored-by: Markus Kling <markus@markus-kling.net> Co-authored-by: Trenton Holmes <holmes.trenton@gmail.com> Co-authored-by: Michael Shamoon <4887959+shamoon@users.noreply.github.com> Co-authored-by: Quinn Casey <quinn@quinncasey.com>
2022-05-19 23:42:25 +02:00
@override_settings(FILENAME_FORMAT=None)
def testDeleteMacFiles(self):
    """A ``._`` companion file next to the input is removed along with the consumed file."""
    # https://github.com/jonaswinkler/paperless-ng/discussions/1037
    filename = self.get_test_file()

    # Create the "._sample.pdf" companion next to the real input file
    companion = os.path.join(self.dirs.scratch_dir, "._sample.pdf")
    shutil.copy(filename, companion)
    self.assertIsFile(companion)

    with self.get_consumer(filename) as consumer:
        consumer.run()

        document = Document.objects.first()
        self.assertIsNotNone(document)

        self.assertIsFile(document.source_path)

        self.assertIsNotFile(companion)
        self.assertIsNotFile(filename)
2021-05-19 20:02:47 +02:00
2020-11-16 23:16:37 +01:00
def testOverrideFilename(self):
    """An overridden file name determines the document title."""
    source = self.get_test_file()
    overrides = DocumentMetadataOverrides(filename="Statement for November.pdf")

    with self.get_consumer(source, overrides) as consumer:
        consumer.run()

        stored = Document.objects.first()
        self.assertIsNotNone(stored)

        self.assertEqual(stored.title, "Statement for November")

    self._assert_first_last_send_progress()
2020-11-16 23:16:37 +01:00
def testOverrideTitle(self):
    """An overridden title is stored on the document as given."""
    source = self.get_test_file()
    overrides = DocumentMetadataOverrides(title="Override Title")

    with self.get_consumer(source, overrides) as consumer:
        consumer.run()

        stored = Document.objects.first()
        self.assertIsNotNone(stored)

        self.assertEqual(stored.title, "Override Title")

    self._assert_first_last_send_progress()
2020-11-16 23:16:37 +01:00
def testOverrideTitleInvalidPlaceholders(self):
    """A malformed title placeholder is logged as an error and the title falls back to ``sample``."""
    expected_str = (
        "Error occurred parsing title override 'Override {correspondent]', "
        "falling back to original"
    )

    with self.assertLogs("paperless.consumer", level="ERROR") as captured:
        source = self.get_test_file()
        overrides = DocumentMetadataOverrides(title="Override {correspondent]")

        with self.get_consumer(source, overrides) as consumer:
            consumer.run()

            stored = Document.objects.first()
            self.assertIsNotNone(stored)

            self.assertEqual(stored.title, "sample")
            self.assertIn(expected_str, captured.output[0])
2020-11-16 23:16:37 +01:00
def testOverrideCorrespondent(self):
    """An overridden correspondent id is assigned to the consumed document."""
    correspondent = Correspondent.objects.create(name="test")
    overrides = DocumentMetadataOverrides(correspondent_id=correspondent.pk)

    with self.get_consumer(self.get_test_file(), overrides) as consumer:
        consumer.run()

        stored = Document.objects.first()
        self.assertIsNotNone(stored)

        self.assertEqual(stored.correspondent.id, correspondent.id)

    self._assert_first_last_send_progress()
2020-11-16 23:16:37 +01:00
def testOverrideDocumentType(self):
    """
    GIVEN:
        - An existing document type passed as an override
    WHEN:
        - The test file is consumed
    THEN:
        - The stored document is assigned that document type
    """
    dt = DocumentType.objects.create(name="test")

    with self.get_consumer(
        self.get_test_file(),
        DocumentMetadataOverrides(document_type_id=dt.pk),
    ) as consumer:
        consumer.run()

    document = Document.objects.first()
    # Fail with a clear assertion, not an AttributeError, if nothing was stored
    self.assertIsNotNone(document)

    self.assertEqual(document.document_type.id, dt.id)

    self._assert_first_last_send_progress()
2020-11-16 23:16:37 +01:00
def testOverrideStoragePath(self):
    """
    GIVEN:
        - An existing storage path passed as an override
    WHEN:
        - The test file is consumed
    THEN:
        - The stored document is assigned that storage path
    """
    sp = StoragePath.objects.create(name="test")

    with self.get_consumer(
        self.get_test_file(),
        DocumentMetadataOverrides(storage_path_id=sp.pk),
    ) as consumer:
        consumer.run()

    document = Document.objects.first()
    # Fail with a clear assertion, not an AttributeError, if nothing was stored
    self.assertIsNotNone(document)

    self.assertEqual(document.storage_path.id, sp.id)

    self._assert_first_last_send_progress()
2020-11-16 23:16:37 +01:00
def testOverrideTags(self):
    """
    GIVEN:
        - Three existing tags, two of which are passed as overrides
    WHEN:
        - The test file is consumed
    THEN:
        - The stored document has the two overridden tags and not the third
    """
    t1 = Tag.objects.create(name="t1")
    t2 = Tag.objects.create(name="t2")
    t3 = Tag.objects.create(name="t3")

    with self.get_consumer(
        self.get_test_file(),
        DocumentMetadataOverrides(tag_ids=[t1.id, t3.id]),
    ) as consumer:
        consumer.run()

    document = Document.objects.first()
    # Fail with a clear assertion, not an AttributeError, if nothing was stored
    self.assertIsNotNone(document)

    # Evaluate the tag relation once instead of once per assertion
    assigned_tags = list(document.tags.all())
    self.assertIn(t1, assigned_tags)
    self.assertNotIn(t2, assigned_tags)
    self.assertIn(t3, assigned_tags)

    self._assert_first_last_send_progress()
2020-11-16 23:16:37 +01:00
def testOverrideCustomFields(self):
    """
    GIVEN:
        - Three existing custom fields, two of which are passed as overrides
    WHEN:
        - The test file is consumed
    THEN:
        - The stored document has instances of the two overridden fields only
    """
    cf1 = CustomField.objects.create(name="Custom Field 1", data_type="string")
    cf2 = CustomField.objects.create(
        name="Custom Field 2",
        data_type="integer",
    )
    cf3 = CustomField.objects.create(
        name="Custom Field 3",
        data_type="url",
    )

    with self.get_consumer(
        self.get_test_file(),
        DocumentMetadataOverrides(custom_field_ids=[cf1.id, cf3.id]),
    ) as consumer:
        consumer.run()

    document = Document.objects.first()
    # Fail with a clear assertion, not an AttributeError, if nothing was stored
    self.assertIsNotNone(document)

    fields_used = [
        field_instance.field for field_instance in document.custom_fields.all()
    ]
    self.assertIn(cf1, fields_used)
    self.assertNotIn(cf2, fields_used)
    self.assertIn(cf3, fields_used)

    self._assert_first_last_send_progress()
def testOverrideAsn(self):
    """
    GIVEN:
        - An ASN override of 123
    WHEN:
        - The test file is consumed
    THEN:
        - The stored document has archive serial number 123
    """
    with self.get_consumer(
        self.get_test_file(),
        DocumentMetadataOverrides(asn=123),
    ) as consumer:
        consumer.run()

    document = Document.objects.first()
    # Fail with a clear assertion, not an AttributeError, if nothing was stored
    self.assertIsNotNone(document)

    self.assertEqual(document.archive_serial_number, 123)

    self._assert_first_last_send_progress()
def testOverrideTitlePlaceholders(self):
    """
    GIVEN:
        - Correspondent and document type overrides
        - A title override built from placeholders
    WHEN:
        - The test file is consumed
    THEN:
        - The placeholders are replaced by the correspondent name, the
          document type name and the current month and two-digit year
    """
    c = Correspondent.objects.create(name="Correspondent Name")
    dt = DocumentType.objects.create(name="DocType Name")

    with self.get_consumer(
        self.get_test_file(),
        DocumentMetadataOverrides(
            correspondent_id=c.pk,
            document_type_id=dt.pk,
            title="{correspondent}{document_type} {added_month}-{added_year_short}",
        ),
    ) as consumer:
        consumer.run()

    document = Document.objects.first()
    # Fail with a clear assertion, not an AttributeError, if nothing was stored
    self.assertIsNotNone(document)

    # NOTE(review): "now" is sampled after consumption, so a run that crosses
    # a month boundary could compare against a different month
    now = timezone.now()
    self.assertEqual(document.title, f"{c.name}{dt.name} {now.strftime('%m-%y')}")

    self._assert_first_last_send_progress()
def testOverrideOwner(self):
    """
    GIVEN:
        - An existing user passed as the owner override
    WHEN:
        - The test file is consumed
    THEN:
        - The stored document is owned by that user
    """
    testuser = User.objects.create(username="testuser")

    with self.get_consumer(
        self.get_test_file(),
        DocumentMetadataOverrides(owner_id=testuser.pk),
    ) as consumer:
        consumer.run()

    document = Document.objects.first()
    # Fail with a clear assertion, not an AttributeError, if nothing was stored
    self.assertIsNotNone(document)

    self.assertEqual(document.owner, testuser)

    self._assert_first_last_send_progress()
def testOverridePermissions(self):
    """
    GIVEN:
        - A user and a group passed as view-permission overrides
    WHEN:
        - The test file is consumed
    THEN:
        - Both the user and the group hold "view_document" on the document
    """
    testuser = User.objects.create(username="testuser")
    testgroup = Group.objects.create(name="testgroup")

    with self.get_consumer(
        self.get_test_file(),
        DocumentMetadataOverrides(
            view_users=[testuser.pk],
            view_groups=[testgroup.pk],
        ),
    ) as consumer:
        consumer.run()

    document = Document.objects.first()
    # A missing document should fail here, not inside the permission checks
    self.assertIsNotNone(document)

    user_checker = ObjectPermissionChecker(testuser)
    self.assertTrue(user_checker.has_perm("view_document", document))
    group_checker = ObjectPermissionChecker(testgroup)
    self.assertTrue(group_checker.has_perm("view_document", document))

    self._assert_first_last_send_progress()
2020-11-16 23:16:37 +01:00
def testNotAFile(self):
    """
    GIVEN:
        - A path that does not point to an existing file
    WHEN:
        - The consumer is run on it
    THEN:
        - A ConsumerError containing "File not found" is raised
        - The progress assertion is made with a last status of FAILED
    """
    missing = Path("non-existing-file")

    with self.get_consumer(missing) as plugin, self.assertRaisesMessage(
        ConsumerError,
        "File not found",
    ):
        plugin.run()

    self._assert_first_last_send_progress(last_status="FAILED")
2020-11-16 23:16:37 +01:00
2020-11-29 19:22:49 +01:00
def testDuplicates1(self):
    """
    GIVEN:
        - The test file has already been consumed once
    WHEN:
        - The same test file is consumed again
    THEN:
        - A ConsumerError containing "It is a duplicate" is raised
    """
    with self.get_consumer(self.get_test_file()) as first_run:
        first_run.run()

    with self.get_consumer(self.get_test_file()) as second_run, self.assertRaisesMessage(
        ConsumerError,
        "It is a duplicate",
    ):
        second_run.run()

    self._assert_first_last_send_progress(last_status="FAILED")
2020-11-16 23:16:37 +01:00
2020-11-29 19:22:49 +01:00
def testDuplicates2(self):
    """
    GIVEN:
        - The test file has already been consumed once
    WHEN:
        - The test archive file is consumed afterwards
    THEN:
        - A ConsumerError containing "It is a duplicate" is raised
    """
    with self.get_consumer(self.get_test_file()) as first_run:
        first_run.run()

    with self.get_consumer(self.get_test_archive_file()) as second_run, self.assertRaisesMessage(
        ConsumerError,
        "It is a duplicate",
    ):
        second_run.run()

    self._assert_first_last_send_progress(last_status="FAILED")
2020-11-29 19:22:49 +01:00
def testDuplicates3(self):
    """
    GIVEN:
        - The test archive file has been consumed first
    WHEN:
        - The test file is consumed afterwards
    THEN:
        - Neither run raises
    """
    # Order matters here: archive file first, then the regular test file
    for fetch_sample in (self.get_test_archive_file, self.get_test_file):
        with self.get_consumer(fetch_sample()) as plugin:
            plugin.run()
2020-11-29 19:22:49 +01:00
def testDuplicateInTrash(self):
    """
    GIVEN:
        - The test file was consumed and all documents were then deleted
    WHEN:
        - The same test file is consumed again
    THEN:
        - A ConsumerError containing "document is in the trash" is raised
    """
    with self.get_consumer(self.get_test_file()) as first_run:
        first_run.run()

    # NOTE(review): presumably a soft delete that leaves the document in the trash
    Document.objects.all().delete()

    with self.get_consumer(self.get_test_file()) as second_run, self.assertRaisesMessage(
        ConsumerError,
        "document is in the trash",
    ):
        second_run.run()
def testAsnExists(self):
    """
    GIVEN:
        - A document consumed with an ASN override of 123
    WHEN:
        - A second, different file is consumed with the same ASN override
    THEN:
        - A ConsumerError containing "ASN 123 already exists" is raised
    """
    first_file = self.get_test_file()
    with self.get_consumer(first_file, DocumentMetadataOverrides(asn=123)) as first_run:
        first_run.run()

    second_file = self.get_test_file2()
    with self.get_consumer(
        second_file,
        DocumentMetadataOverrides(asn=123),
    ) as second_run, self.assertRaisesMessage(ConsumerError, "ASN 123 already exists"):
        second_run.run()
def testAsnExistsInTrash(self):
    """
    GIVEN:
        - A document consumed with an ASN override of 123 and then deleted
    WHEN:
        - A second, different file is consumed with the same ASN override
    THEN:
        - A ConsumerError containing "document is in the trash" is raised
    """
    first_file = self.get_test_file()
    with self.get_consumer(first_file, DocumentMetadataOverrides(asn=123)) as first_run:
        first_run.run()

    trashed = Document.objects.first()
    trashed.delete()

    second_file = self.get_test_file2()
    with self.get_consumer(
        second_file,
        DocumentMetadataOverrides(asn=123),
    ) as second_run, self.assertRaisesMessage(ConsumerError, "document is in the trash"):
        second_run.run()
2020-11-16 23:16:37 +01:00
@mock.patch("documents.parsers.document_consumer_declaration.send")
def testNoParsers(self, m):
    """
    GIVEN:
        - The parser declaration signal returns no parsers
    WHEN:
        - The test file is consumed
    THEN:
        - A ConsumerError about an unsupported mime type is raised
    """
    m.return_value = []
    expected_message = "sample.pdf: Unsupported mime type application/pdf"

    with self.get_consumer(self.get_test_file()) as plugin, self.assertRaisesMessage(
        ConsumerError,
        expected_message,
    ):
        plugin.run()

    self._assert_first_last_send_progress(last_status="FAILED")
2020-11-16 23:16:37 +01:00
@mock.patch("documents.parsers.document_consumer_declaration.send")
def testFaultyParser(self, m):
    """
    GIVEN:
        - The only declared PDF parser is the faulty test parser
    WHEN:
        - The test file is consumed
    THEN:
        - A ConsumerError carrying the parser's "Does not compute." message
          is raised
    """
    faulty_declaration = {
        "parser": self.make_faulty_parser,
        "mime_types": {"application/pdf": ".pdf"},
        "weight": 0,
    }
    m.return_value = [(None, faulty_declaration)]
    expected_message = "sample.pdf: Error occurred while consuming document sample.pdf: Does not compute."

    with self.get_consumer(self.get_test_file()) as plugin, self.assertRaisesMessage(
        ConsumerError,
        expected_message,
    ):
        plugin.run()

    self._assert_first_last_send_progress(last_status="FAILED")
@mock.patch("documents.parsers.document_consumer_declaration.send")
def testGenericParserException(self, m):
    """
    GIVEN:
        - The only declared PDF parser is the generic-exception test parser
    WHEN:
        - The test file is consumed
    THEN:
        - A ConsumerError reporting an unexpected error with the message
          "Generic exception." is raised
    """
    generic_failure_declaration = {
        "parser": self.make_faulty_generic_exception_parser,
        "mime_types": {"application/pdf": ".pdf"},
        "weight": 0,
    }
    m.return_value = [(None, generic_failure_declaration)]
    expected_message = "sample.pdf: Unexpected error while consuming document sample.pdf: Generic exception."

    with self.get_consumer(self.get_test_file()) as plugin, self.assertRaisesMessage(
        ConsumerError,
        expected_message,
    ):
        plugin.run()

    self._assert_first_last_send_progress(last_status="FAILED")
2020-11-16 23:16:37 +01:00
@mock.patch("documents.consumer.ConsumerPlugin._write")
def testPostSaveError(self, m):
    """
    GIVEN:
        - ConsumerPlugin._write is patched to raise OSError("NO.")
    WHEN:
        - The test file is consumed
    THEN:
        - A ConsumerError describing the storage failure is raised
        - The input file still exists
        - No document was saved to the database
    """
    filename = self.get_test_file()
    m.side_effect = OSError("NO.")

    # Consume the very path that is checked below, instead of fetching the
    # test file a second time and asserting on a different handle
    with self.get_consumer(filename) as consumer:
        with self.assertRaisesMessage(
            ConsumerError,
            "sample.pdf: The following error occurred while storing document sample.pdf after parsing: NO.",
        ):
            consumer.run()

    self._assert_first_last_send_progress(last_status="FAILED")

    # file not deleted
    self.assertIsFile(filename)

    # Database empty
    self.assertEqual(Document.objects.all().count(), 0)
2020-11-16 23:16:37 +01:00
Feature: Dynamic document storage pathes (#916) * Added devcontainer * Add feature storage pathes * Exclude tests and add versioning * Check escaping * Check escaping * Check quoting * Echo * Escape * Escape : * Double escape \ * Escaping * Remove if * Escape colon * Missing \ * Esacpe : * Escape all * test * Remove sed * Fix exclude * Remove SED command * Add LD_LIBRARY_PATH * Adjusted to v1.7 * Updated test-cases * Remove devcontainer * Removed internal build-file * Run pre-commit * Corrected flak8 error * Adjusted to v1.7 * Updated test-cases * Corrected flak8 error * Adjusted to new plural translations * Small adjustments due to code-review backend * Adjusted line-break * Removed PAPERLESS prefix from settings variables * Corrected style change due to search+replace * First documentation draft * Revert changes to Pipfile * Add sphinx-autobuild with keep-outdated * Revert merge error that results in wrong storage path is evaluated * Adjust styles of generated files ... * Adds additional testing to cover dynamic storage path functionality * Remove unnecessary condition * Add hint to edit storage path dialog * Correct spelling of pathes to paths * Minor documentation tweaks * Minor typo * improving wrapping of filter editor buttons with new storage path button * Update .gitignore * Fix select border radius in non input-groups * Better storage path edit hint * Add note to edit storage path dialog re document_renamer * Add note to bulk edit storage path re document_renamer * Rename FILTER_STORAGE_DIRECTORY to PATH * Fix broken filter rule parsing * Show default storage if unspecified * Remove note re storage path on bulk edit * Add basic validation of filename variables Co-authored-by: Markus Kling <markus@markus-kling.net> Co-authored-by: Trenton Holmes <holmes.trenton@gmail.com> Co-authored-by: Michael Shamoon <4887959+shamoon@users.noreply.github.com> Co-authored-by: Quinn Casey <quinn@quinncasey.com>
2022-05-19 23:42:25 +02:00
@override_settings(FILENAME_FORMAT="{correspondent}/{title}")
def testFilenameHandling(self):
    """
    GIVEN:
        - A filename format of "{correspondent}/{title}"
        - A title override and no correspondent
    WHEN:
        - The test file is consumed
    THEN:
        - The original and archive filenames are both "none/new docs.pdf"
    """
    with self.get_consumer(
        self.get_test_file(),
        DocumentMetadataOverrides(title="new docs"),
    ) as consumer:
        consumer.run()

    document = Document.objects.first()
    # Fail with a clear assertion, not an AttributeError, if nothing was stored
    self.assertIsNotNone(document)

    self.assertEqual(document.title, "new docs")
    self.assertEqual(document.filename, "none/new docs.pdf")
    self.assertEqual(document.archive_filename, "none/new docs.pdf")

    self._assert_first_last_send_progress()
Feature: Dynamic document storage pathes (#916) * Added devcontainer * Add feature storage pathes * Exclude tests and add versioning * Check escaping * Check escaping * Check quoting * Echo * Escape * Escape : * Double escape \ * Escaping * Remove if * Escape colon * Missing \ * Esacpe : * Escape all * test * Remove sed * Fix exclude * Remove SED command * Add LD_LIBRARY_PATH * Adjusted to v1.7 * Updated test-cases * Remove devcontainer * Removed internal build-file * Run pre-commit * Corrected flak8 error * Adjusted to v1.7 * Updated test-cases * Corrected flak8 error * Adjusted to new plural translations * Small adjustments due to code-review backend * Adjusted line-break * Removed PAPERLESS prefix from settings variables * Corrected style change due to search+replace * First documentation draft * Revert changes to Pipfile * Add sphinx-autobuild with keep-outdated * Revert merge error that results in wrong storage path is evaluated * Adjust styles of generated files ... * Adds additional testing to cover dynamic storage path functionality * Remove unnecessary condition * Add hint to edit storage path dialog * Correct spelling of pathes to paths * Minor documentation tweaks * Minor typo * improving wrapping of filter editor buttons with new storage path button * Update .gitignore * Fix select border radius in non input-groups * Better storage path edit hint * Add note to edit storage path dialog re document_renamer * Add note to bulk edit storage path re document_renamer * Rename FILTER_STORAGE_DIRECTORY to PATH * Fix broken filter rule parsing * Show default storage if unspecified * Remove note re storage path on bulk edit * Add basic validation of filename variables Co-authored-by: Markus Kling <markus@markus-kling.net> Co-authored-by: Trenton Holmes <holmes.trenton@gmail.com> Co-authored-by: Michael Shamoon <4887959+shamoon@users.noreply.github.com> Co-authored-by: Quinn Casey <quinn@quinncasey.com>
2022-05-19 23:42:25 +02:00
@override_settings(FILENAME_FORMAT="{correspondent}/{title}")
@mock.patch("documents.signals.handlers.generate_unique_filename")
def testFilenameHandlingUnstableFormat(self, m):
    """
    GIVEN:
        - generate_unique_filename is patched to return a different name on
          every call
        - An inbox tag exists
    WHEN:
        - The test file is consumed with a title override
    THEN:
        - The document is stored with the overridden title
        - Its source and archive paths both point to existing files
    """
    filenames = ["this", "that", "now this", "i cannot decide"]

    def get_filename():
        # Rotate the list so consecutive calls never return the same name
        f = filenames.pop()
        filenames.insert(0, f)
        return f

    # Keep the keyword name: callers may pass archive_filename by keyword
    m.side_effect = lambda f, archive_filename=False: get_filename()

    Tag.objects.create(name="test", is_inbox_tag=True)

    with self.get_consumer(
        self.get_test_file(),
        DocumentMetadataOverrides(title="new docs"),
    ) as consumer:
        consumer.run()

    document = Document.objects.first()
    # Guard the document itself; checking the title for None after the
    # equality assertion below could never fail
    self.assertIsNotNone(document)

    self.assertEqual(document.title, "new docs")
    self.assertIsFile(document.source_path)
    self.assertIsFile(document.archive_path)

    self._assert_first_last_send_progress()
@mock.patch("documents.consumer.load_classifier")
def testClassifyDocument(self, m):
    """
    GIVEN:
        - A correspondent, a document type and two tags using auto matching
        - A mocked classifier predicting the correspondent, the document
          type and only the first tag
    WHEN:
        - The test file is consumed
    THEN:
        - The document gets the predicted correspondent, type and tag
        - The tag that was not predicted is not assigned
    """
    correspondent = Correspondent.objects.create(
        name="test",
        matching_algorithm=Correspondent.MATCH_AUTO,
    )
    dtype = DocumentType.objects.create(
        name="test",
        matching_algorithm=DocumentType.MATCH_AUTO,
    )
    t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO)
    t2 = Tag.objects.create(name="t2", matching_algorithm=Tag.MATCH_AUTO)

    m.return_value = MagicMock()
    m.return_value.predict_correspondent.return_value = correspondent.pk
    m.return_value.predict_document_type.return_value = dtype.pk
    m.return_value.predict_tags.return_value = [t1.pk]

    with self.get_consumer(self.get_test_file()) as consumer:
        consumer.run()

    document = Document.objects.first()
    # Fail with a clear assertion, not an AttributeError, if nothing was stored
    self.assertIsNotNone(document)

    self.assertEqual(document.correspondent, correspondent)
    self.assertEqual(document.document_type, dtype)
    self.assertIn(t1, document.tags.all())
    self.assertNotIn(t2, document.tags.all())

    self._assert_first_last_send_progress()
2020-12-20 00:06:33 +01:00
@override_settings(CONSUMER_DELETE_DUPLICATES=True)
def test_delete_duplicate(self):
    """
    GIVEN:
        - CONSUMER_DELETE_DUPLICATES is enabled
        - The test file has been consumed once
    WHEN:
        - The test file is consumed a second time
    THEN:
        - A ConsumerError is raised
        - The duplicate input file no longer exists
    """
    first_copy = self.get_test_file()
    self.assertIsFile(first_copy)

    with self.get_consumer(first_copy) as plugin:
        plugin.run()
    stored = Document.objects.first()

    self._assert_first_last_send_progress()
    self.assertIsNotFile(first_copy)
    self.assertIsNotNone(stored)

    second_copy = self.get_test_file()
    self.assertIsFile(second_copy)

    with self.get_consumer(second_copy) as plugin, self.assertRaises(ConsumerError):
        plugin.run()

    self.assertIsNotFile(second_copy)
    self._assert_first_last_send_progress(last_status="FAILED")
2020-12-20 00:06:33 +01:00
@override_settings(CONSUMER_DELETE_DUPLICATES=False)
def test_no_delete_duplicate(self):
    """
    GIVEN:
        - CONSUMER_DELETE_DUPLICATES is disabled
        - The test file has been consumed once
    WHEN:
        - The test file is consumed a second time
    THEN:
        - A ConsumerError naming the existing document is raised
        - The duplicate input file is left in place
    """
    duplicate_pattern = r"sample\.pdf: Not consuming sample\.pdf: It is a duplicate of sample \(#\d+\)"

    first_copy = self.get_test_file()
    self.assertIsFile(first_copy)

    with self.get_consumer(first_copy) as plugin:
        plugin.run()
    stored = Document.objects.first()

    self._assert_first_last_send_progress()
    self.assertIsNotFile(first_copy)
    self.assertIsNotNone(stored)

    second_copy = self.get_test_file()
    self.assertIsFile(second_copy)

    with self.get_consumer(second_copy) as plugin, self.assertRaisesRegex(
        ConsumerError,
        duplicate_pattern,
    ):
        plugin.run()

    self.assertIsFile(second_copy)
    self._assert_first_last_send_progress(last_status="FAILED")
Feature: Dynamic document storage pathes (#916) * Added devcontainer * Add feature storage pathes * Exclude tests and add versioning * Check escaping * Check escaping * Check quoting * Echo * Escape * Escape : * Double escape \ * Escaping * Remove if * Escape colon * Missing \ * Esacpe : * Escape all * test * Remove sed * Fix exclude * Remove SED command * Add LD_LIBRARY_PATH * Adjusted to v1.7 * Updated test-cases * Remove devcontainer * Removed internal build-file * Run pre-commit * Corrected flak8 error * Adjusted to v1.7 * Updated test-cases * Corrected flak8 error * Adjusted to new plural translations * Small adjustments due to code-review backend * Adjusted line-break * Removed PAPERLESS prefix from settings variables * Corrected style change due to search+replace * First documentation draft * Revert changes to Pipfile * Add sphinx-autobuild with keep-outdated * Revert merge error that results in wrong storage path is evaluated * Adjust styles of generated files ... * Adds additional testing to cover dynamic storage path functionality * Remove unnecessary condition * Add hint to edit storage path dialog * Correct spelling of pathes to paths * Minor documentation tweaks * Minor typo * improving wrapping of filter editor buttons with new storage path button * Update .gitignore * Fix select border radius in non input-groups * Better storage path edit hint * Add note to edit storage path dialog re document_renamer * Add note to bulk edit storage path re document_renamer * Rename FILTER_STORAGE_DIRECTORY to PATH * Fix broken filter rule parsing * Show default storage if unspecified * Remove note re storage path on bulk edit * Add basic validation of filename variables Co-authored-by: Markus Kling <markus@markus-kling.net> Co-authored-by: Trenton Holmes <holmes.trenton@gmail.com> Co-authored-by: Michael Shamoon <4887959+shamoon@users.noreply.github.com> Co-authored-by: Quinn Casey <quinn@quinncasey.com>
2022-05-19 23:42:25 +02:00
@override_settings(FILENAME_FORMAT="{title}")
@mock.patch("documents.parsers.document_consumer_declaration.send")
def test_similar_filenames(self, m):
    """
    GIVEN:
        - A filename format of "{title}"
        - Three input files named simple.pdf, simple.png and simple.png.pdf
        - CopyParser declared for both PDF and PNG
    WHEN:
        - The files are consumed as simple.png, simple.pdf, simple.png.pdf
    THEN:
        - Each document gets the original and archive filenames asserted
          below, with the colliding archive name suffixed "_01"
        - sanity_check() runs afterwards
    """
    samples_dir = Path(__file__).parent / "samples"
    for sample_name, consumed_name in (
        ("simple.pdf", "simple.pdf"),
        ("simple.png", "simple.png"),
        ("simple-noalpha.png", "simple.png.pdf"),
    ):
        shutil.copy(
            samples_dir / sample_name,
            settings.CONSUMPTION_DIR / consumed_name,
        )

    m.return_value = [
        (
            None,
            {
                "parser": CopyParser,
                "mime_types": {"application/pdf": ".pdf", "image/png": ".png"},
                "weight": 0,
            },
        ),
    ]

    consumed = []
    for consumed_name in ("simple.png", "simple.pdf", "simple.png.pdf"):
        with self.get_consumer(settings.CONSUMPTION_DIR / consumed_name) as consumer:
            consumer.run()
        # Look up the newest document instead of assuming pk 1, 2, 3:
        # primary-key sequences are not guaranteed to restart for each test
        newest = Document.objects.order_by("-pk").first()
        self.assertIsNotNone(newest)
        consumed.append(newest)

    # Every run must have produced its own document
    self.assertEqual(len({doc.pk for doc in consumed}), 3)
    doc1, doc2, doc3 = consumed

    self.assertEqual(doc1.filename, "simple.png")
    self.assertEqual(doc1.archive_filename, "simple.pdf")
    self.assertEqual(doc2.filename, "simple.pdf")
    self.assertEqual(doc2.archive_filename, "simple_01.pdf")
    self.assertEqual(doc3.filename, "simple.png.pdf")
    self.assertEqual(doc3.archive_filename, "simple.png.pdf")

    sanity_check()
2021-01-01 23:27:55 +01:00
@mock.patch("documents.consumer.run_subprocess")
def test_try_to_clean_invalid_pdf(self, m):
    """
    GIVEN:
        - The invalid PDF sample placed in the consumption directory
        - run_subprocess is mocked
    WHEN:
        - The file is consumed
    THEN:
        - A ConsumerError is raised
        - run_subprocess was called exactly once with a command starting
          with "qpdf --replace-input"
    """
    broken_pdf = settings.CONSUMPTION_DIR / "invalid_pdf.pdf"
    shutil.copy(Path(__file__).parent / "samples" / "invalid_pdf.pdf", broken_pdf)

    with self.get_consumer(broken_pdf) as plugin:
        # fails because no qpdf
        with self.assertRaises(ConsumerError):
            plugin.run()

    m.assert_called_once()

    positional, _ = m.call_args
    qpdf_command = positional[0]

    self.assertEqual(qpdf_command[0], "qpdf")
    self.assertEqual(qpdf_command[1], "--replace-input")
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
class TestConsumerCreatedDate(DirectoriesMixin, GetConsumerMixin, TestCase):
    """Checks where the consumer takes a document's created date from."""

    def _consume_sample(self, sample_name, consumed_name):
        """
        Copy a sample original into the scratch directory under
        consumed_name, consume it and return the first stored document.

        Only called from inside test methods, so the class-level patch and
        any per-test settings overrides are active while it runs.
        """
        src = (
            Path(__file__).parent
            / "samples"
            / "documents"
            / "originals"
            / sample_name
        )
        dst = self.dirs.scratch_dir / consumed_name
        shutil.copy(src, dst)

        with self.get_consumer(dst) as consumer:
            consumer.run()

        document = Document.objects.first()
        # Fail with a clear assertion, not an AttributeError, if nothing was stored
        self.assertIsNotNone(document)
        return document

    def test_consume_date_from_content(self):
        """
        GIVEN:
            - File content with date in DMY (default) format

        THEN:
            - Should parse the date from the file content
        """
        document = self._consume_sample("0000005.pdf", "sample.pdf")

        self.assertEqual(
            document.created,
            datetime.datetime(1996, 2, 20, tzinfo=tz.gettz(settings.TIME_ZONE)),
        )

    @override_settings(FILENAME_DATE_ORDER="YMD")
    def test_consume_date_from_filename(self):
        """
        GIVEN:
            - File content with date in DMY (default) format
            - Filename with date in YMD format

        THEN:
            - Should parse the date from the filename
        """
        document = self._consume_sample("0000005.pdf", "Scan - 2022-02-01.pdf")

        self.assertEqual(
            document.created,
            datetime.datetime(2022, 2, 1, tzinfo=tz.gettz(settings.TIME_ZONE)),
        )

    def test_consume_date_filename_date_use_content(self):
        """
        GIVEN:
            - File content with date in DMY (default) format
            - Filename date parsing disabled
            - Filename with date in YMD format

        THEN:
            - Should parse the date from the content
        """
        document = self._consume_sample("0000005.pdf", "Scan - 2022-02-01.pdf")

        self.assertEqual(
            document.created,
            datetime.datetime(1996, 2, 20, tzinfo=tz.gettz(settings.TIME_ZONE)),
        )

    @override_settings(
        IGNORE_DATES=(datetime.date(2010, 12, 13), datetime.date(2011, 11, 12)),
    )
    def test_consume_date_use_content_with_ignore(self):
        """
        GIVEN:
            - File content with dates in DMY (default) format
            - File content includes ignored dates

        THEN:
            - Should parse a date from the content that is not one of the
              ignored dates (the filename contains no date)
        """
        document = self._consume_sample("0000006.pdf", "0000006.pdf")

        self.assertEqual(
            document.created,
            datetime.datetime(1997, 2, 20, tzinfo=tz.gettz(settings.TIME_ZONE)),
        )
class PreConsumeTestCase(DirectoriesMixin, GetConsumerMixin, TestCase):
    """Tests for how the consumer handles the PRE_CONSUME_SCRIPT setting."""

    def setUp(self) -> None:
        super().setUp()
        originals_dir = Path(__file__).parent / "samples" / "documents" / "originals"
        self.test_file = self.dirs.scratch_dir / "sample.pdf"
        shutil.copy(originals_dir / "0000005.pdf", self.test_file)

    @mock.patch("documents.consumer.run_subprocess")
    @override_settings(PRE_CONSUME_SCRIPT=None)
    def test_no_pre_consume_script(self, m):
        """
        GIVEN:
            - No pre-consume script configured
        WHEN:
            - The test file is consumed
        THEN:
            - run_subprocess is never called
        """
        with self.get_consumer(self.test_file) as plugin:
            plugin.run()
            m.assert_not_called()

    @mock.patch("documents.consumer.run_subprocess")
    @override_settings(PRE_CONSUME_SCRIPT="does-not-exist")
    def test_pre_consume_script_not_found(self, m):
        """
        GIVEN:
            - A pre-consume script setting of "does-not-exist"
        WHEN:
            - The test file is consumed
        THEN:
            - A ConsumerError is raised
            - run_subprocess is never called
        """
        with self.get_consumer(self.test_file) as plugin:
            with self.assertRaises(ConsumerError):
                plugin.run()
            m.assert_not_called()

    @mock.patch("documents.consumer.run_subprocess")
    def test_pre_consume_script(self, m):
        """
        GIVEN:
            - An existing temporary file configured as pre-consume script
        WHEN:
            - The test file is consumed
        THEN:
            - run_subprocess is called once with the script and the test file
            - The environment passed along contains the source path, the
              working path and the task id
        """
        with tempfile.NamedTemporaryFile() as script, override_settings(
            PRE_CONSUME_SCRIPT=script.name,
        ), self.get_consumer(self.test_file) as c:
            c.run()

            m.assert_called_once()

            positional, _ = m.call_args
            command, environment = positional[0], positional[1]

            self.assertEqual(command[0], script.name)
            self.assertEqual(command[1], str(self.test_file))

            expected_env = {
                "DOCUMENT_SOURCE_PATH": str(c.input_doc.original_file),
                "DOCUMENT_WORKING_PATH": str(c.working_copy),
                "TASK_ID": c.task_id,
            }
            # Subset check: merging the expected keys must not change anything
            self.assertDictEqual(environment, {**environment, **expected_env})

    def test_script_with_output(self):
        """
        GIVEN:
            - A script which outputs to stdout and stderr
        WHEN:
            - The script is executed as a consume script
        THEN:
            - The script's outputs are logged
        """
        with tempfile.NamedTemporaryFile(mode="w") as script:
            # Write up a little script
            with script.file as outfile:
                outfile.writelines(
                    [
                        "#!/usr/bin/env bash\n",
                        "echo This message goes to stdout\n",
                        "echo This message goes to stderr >&2",
                    ],
                )

            # Make the file executable
            script_path = Path(script.name)
            script_path.chmod(script_path.stat().st_mode | stat.S_IEXEC)

            with override_settings(PRE_CONSUME_SCRIPT=script.name), self.assertLogs(
                "paperless.consumer",
                level="INFO",
            ) as captured:
                with self.get_consumer(self.test_file) as plugin:
                    plugin.run()

                for expected_record in (
                    "INFO:paperless.consumer:This message goes to stdout",
                    "WARNING:paperless.consumer:This message goes to stderr",
                ):
                    self.assertIn(expected_record, captured.output)

    def test_script_exit_non_zero(self):
        """
        GIVEN:
            - A script which exits with a non-zero exit code
        WHEN:
            - The script is executed as a pre-consume script
        THEN:
            - A ConsumerError is raised
        """
        with tempfile.NamedTemporaryFile(mode="w") as script:
            # Write up a little script
            with script.file as outfile:
                outfile.writelines(["#!/usr/bin/env bash\n", "exit 100\n"])

            # Make the file executable
            script_path = Path(script.name)
            script_path.chmod(script_path.stat().st_mode | stat.S_IEXEC)

            with override_settings(PRE_CONSUME_SCRIPT=script.name), self.get_consumer(
                self.test_file,
            ) as plugin, self.assertRaises(ConsumerError):
                plugin.run()
2021-01-06 14:08:44 +01:00
class PostConsumeTestCase(DirectoriesMixin, GetConsumerMixin, TestCase):
def setUp(self) -> None:
    """Copy the 0000005.pdf sample into the scratch directory as sample.pdf."""
    super().setUp()
    originals_dir = Path(__file__).parent / "samples" / "documents" / "originals"
    self.test_file = self.dirs.scratch_dir / "sample.pdf"
    shutil.copy(originals_dir / "0000005.pdf", self.test_file)
@mock.patch("documents.consumer.run_subprocess")
@override_settings(POST_CONSUME_SCRIPT=None)
def test_no_post_consume_script(self, m):
    """
    GIVEN:
        - No post-consume script configured
        - A document with two tags
    WHEN:
        - The post-consume step is run for the document
    THEN:
        - run_subprocess is never called
    """
    doc = Document.objects.create(title="Test", mime_type="application/pdf")
    created_tags = [Tag.objects.create(name=tag_name) for tag_name in ("a", "b")]
    for tag in created_tags:
        doc.tags.add(tag)

    with self.get_consumer(self.test_file) as plugin:
        plugin.run_post_consume_script(doc)

    m.assert_not_called()
2021-01-06 14:08:44 +01:00
@override_settings(POST_CONSUME_SCRIPT="does-not-exist")
def test_post_consume_script_not_found(self):
    """
    GIVEN:
        - A post-consume script setting of "does-not-exist"
    WHEN:
        - The post-consume step is run for a document
    THEN:
        - A ConsumerError naming the missing script is raised
    """
    doc = Document.objects.create(title="Test", mime_type="application/pdf")
    expected_message = "sample.pdf: Configured post-consume script does-not-exist does not exist"

    with self.get_consumer(self.test_file) as plugin, self.assertRaisesMessage(
        ConsumerError,
        expected_message,
    ):
        plugin.run_post_consume_script(doc)
2021-01-06 14:08:44 +01:00
@mock.patch("documents.consumer.run_subprocess")
def test_post_consume_script_simple(self, m):
    """
    GIVEN:
        - An existing temporary file configured as post-consume script
    WHEN:
        - The post-consume step is run for a document
    THEN:
        - run_subprocess is called exactly once
    """
    with tempfile.NamedTemporaryFile() as script, override_settings(
        POST_CONSUME_SCRIPT=script.name,
    ):
        doc = Document.objects.create(title="Test", mime_type="application/pdf")

        with self.get_consumer(self.test_file) as plugin:
            plugin.run_post_consume_script(doc)

        m.assert_called_once()
2021-01-01 23:27:55 +01:00
@mock.patch("documents.consumer.run_subprocess")
def test_post_consume_script_with_correspondent(self, m):
    """
    GIVEN:
        - An existing temporary file configured as post-consume script
        - A document with a correspondent and two tags
    WHEN:
        - The post-consume step is run for the document
    THEN:
        - run_subprocess is called once
        - The command carries the document id, URLs, correspondent and tags
        - The environment contains the matching DOCUMENT_* values and task id
    """
    with tempfile.NamedTemporaryFile() as script, override_settings(
        POST_CONSUME_SCRIPT=script.name,
    ):
        bank = Correspondent.objects.create(name="my_bank")
        doc = Document.objects.create(
            title="Test",
            mime_type="application/pdf",
            correspondent=bank,
        )
        created_tags = [Tag.objects.create(name=tag_name) for tag_name in ("a", "b")]
        for tag in created_tags:
            doc.tags.add(tag)

        with self.get_consumer(self.test_file) as consumer:
            consumer.run_post_consume_script(doc)

            m.assert_called_once()

            positional, _ = m.call_args
            command, environment = positional[0], positional[1]

            download_url = f"/api/documents/{doc.pk}/download/"
            thumbnail_url = f"/api/documents/{doc.pk}/thumb/"

            self.assertEqual(command[0], script.name)
            self.assertEqual(command[1], str(doc.pk))
            self.assertEqual(command[5], download_url)
            self.assertEqual(command[6], thumbnail_url)
            self.assertEqual(command[7], "my_bank")
            self.assertCountEqual(command[8].split(","), ["a", "b"])

            expected_env = {
                "DOCUMENT_ID": str(doc.pk),
                "DOCUMENT_DOWNLOAD_URL": download_url,
                "DOCUMENT_THUMBNAIL_URL": thumbnail_url,
                "DOCUMENT_CORRESPONDENT": "my_bank",
                "DOCUMENT_TAGS": "a,b",
                "TASK_ID": consumer.task_id,
            }
            # Subset check: merging the expected keys must not change anything
            self.assertDictEqual(environment, {**environment, **expected_env})
def test_script_exit_non_zero(self):
    """
    GIVEN:
        - A script which exits with a non-zero exit code
    WHEN:
        - The script is executed as a post-consume script
    THEN:
        - A ConsumerError is raised
    """
    failure_pattern = r"sample\.pdf: Error while executing post-consume script: Command '\[.*\]' returned non-zero exit status \d+\."

    with tempfile.NamedTemporaryFile(mode="w") as script:
        # Write up a little script
        with script.file as outfile:
            outfile.writelines(["#!/usr/bin/env bash\n", "exit -500\n"])

        # Make the file executable
        script_path = Path(script.name)
        script_path.chmod(script_path.stat().st_mode | stat.S_IEXEC)

        with override_settings(POST_CONSUME_SCRIPT=script.name):
            doc = Document.objects.create(title="Test", mime_type="application/pdf")

            with self.get_consumer(self.test_file) as plugin, self.assertRaisesRegex(
                ConsumerError,
                failure_pattern,
            ):
                plugin.run_post_consume_script(doc)