# paperless-ngx/src/documents/tests/test_parsers.py
import os
import shutil
import tempfile
from tempfile import TemporaryDirectory
from unittest import mock

from django.test import TestCase, override_settings

from documents.parsers import (
    DocumentParser,
    get_default_file_extension,
    get_parser_class,
    get_parser_class_for_mime_type,
    get_supported_file_extensions,
    is_file_ext_supported,
)
from paperless_tesseract.parsers import RasterisedDocumentParser
from paperless_text.parsers import TextDocumentParser


def fake_magic_from_file(file, mime=False):
    """Stand-in for ``magic.from_file`` that never reads the file.

    Args:
        file: Path of the file to "inspect"; only its extension is used.
        mime: When true, return a MIME type instead of a description.

    Returns:
        With ``mime`` set: ``"application/pdf"`` for a ``.pdf`` extension and
        ``"unknown"`` for anything else. Otherwise a fixed descriptive string.
    """
    if not mime:
        return "A verbose string that describes the contents of the file"

    # The extension comparison is case-sensitive, exactly as written.
    extension = os.path.splitext(file)[1]
    if extension == ".pdf":
        return "application/pdf"
    return "unknown"


@mock.patch("documents.parsers.magic.from_file", fake_magic_from_file)
class TestParserDiscovery(TestCase):
    """Check which parser class ``get_parser_class`` selects.

    ``magic.from_file`` is replaced by ``fake_magic_from_file`` for the test
    methods of this class, so MIME detection depends only on the file name.
    Each test also mocks ``document_consumer_declaration.send`` to control
    the set of declared parsers.
    """

    @mock.patch("documents.parsers.document_consumer_declaration.send")
    def test__get_parser_class_1_parser(self, m, *args):
        """A single parser declared for the MIME type is the one returned."""

        class DummyParser:
            pass

        m.return_value = (
            (
                None,
                {
                    "weight": 0,
                    "parser": DummyParser,
                    "mime_types": {"application/pdf": ".pdf"},
                },
            ),
        )

        self.assertEqual(get_parser_class("doc.pdf"), DummyParser)

    @mock.patch("documents.parsers.document_consumer_declaration.send")
    def test__get_parser_class_n_parsers(self, m, *args):
        """Of two parsers for the same MIME type, the weight-1 one is expected."""

        class DummyParser1:
            pass

        class DummyParser2:
            pass

        m.return_value = (
            (
                None,
                {
                    "weight": 0,
                    "parser": DummyParser1,
                    "mime_types": {"application/pdf": ".pdf"},
                },
            ),
            (
                None,
                {
                    "weight": 1,
                    "parser": DummyParser2,
                    "mime_types": {"application/pdf": ".pdf"},
                },
            ),
        )

        self.assertEqual(get_parser_class("doc.pdf"), DummyParser2)

    @mock.patch("documents.parsers.document_consumer_declaration.send")
    def test__get_parser_class_0_parsers(self, m, *args):
        """With no parsers declared, ``None`` is returned."""
        m.return_value = []

        # No temporary directory is needed: the mocked MIME lookup never
        # touches the file system.
        self.assertIsNone(get_parser_class("doc.pdf"))


2020-11-25 21:38:19 +01:00
def fake_get_thumbnail(self, path, mimetype, file_name):
    """Replacement for ``DocumentParser.get_thumbnail`` used in tests.

    All arguments are ignored. The return value is always the path
    ``examples/no-text.png`` located next to this test module.
    """
    examples_dir = os.path.join(os.path.dirname(__file__), "examples")
    return os.path.join(examples_dir, "no-text.png")


class TestBaseParser(TestCase):
    """Exercise ``DocumentParser.get_optimised_thumbnail``.

    ``DocumentParser.get_thumbnail`` is replaced by ``fake_get_thumbnail`` in
    each test, and ``SCRATCH_DIR`` points at a per-test temporary directory.
    """

    def setUp(self) -> None:
        self.scratch = tempfile.mkdtemp()
        # Keep a handle on the override so tearDown can revert it; otherwise
        # SCRATCH_DIR would stay overridden after the test has finished.
        self._scratch_override = override_settings(SCRATCH_DIR=self.scratch)
        self._scratch_override.enable()

    def tearDown(self) -> None:
        # Restore the setting before deleting the directory it points to.
        self._scratch_override.disable()
        shutil.rmtree(self.scratch)

    @mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail)
    @override_settings(OPTIMIZE_THUMBNAILS=True)
    def test_get_optimised_thumbnail(self):
        """Call the method with optimisation on; only checks it does not raise."""
        parser = DocumentParser(None)
        parser.get_optimised_thumbnail("any", "not important", "document.pdf")

    @mock.patch("documents.parsers.DocumentParser.get_thumbnail", fake_get_thumbnail)
    @override_settings(OPTIMIZE_THUMBNAILS=False)
    def test_get_optimised_thumb_disabled(self):
        """With optimisation off, the ``get_thumbnail`` path is returned as-is."""
        parser = DocumentParser(None)
        path = parser.get_optimised_thumbnail("any", "not important", "document.pdf")
        self.assertEqual(path, fake_get_thumbnail(None, None, None, None))


2020-11-25 21:38:19 +01:00
class TestParserAvailability(TestCase):
    """Check the extension and MIME-type lookups against the real parsers."""

    def test_file_extensions(self):
        """Verify supported extensions, default extensions and parser lookup."""
        # Fetch once; the result does not change between loop iterations.
        supported = get_supported_file_extensions()
        for ext in [".pdf", ".jpe", ".jpg", ".jpeg", ".txt", ".csv"]:
            with self.subTest(ext=ext):
                self.assertIn(ext, supported)

        self.assertEqual(get_default_file_extension('application/pdf'), ".pdf")
        self.assertEqual(get_default_file_extension('image/png'), ".png")
        self.assertEqual(get_default_file_extension('image/jpeg'), ".jpg")
        self.assertEqual(get_default_file_extension('text/plain'), ".txt")
        self.assertEqual(get_default_file_extension('text/csv'), ".csv")
        self.assertEqual(get_default_file_extension('application/zip'), ".zip")
        # An unrecognised MIME type yields an empty extension.
        self.assertEqual(get_default_file_extension('aasdasd/dgfgf'), "")

        pdf_parser_class = get_parser_class_for_mime_type('application/pdf')
        self.assertIsInstance(
            pdf_parser_class(logging_group=None),
            RasterisedDocumentParser,
        )
        text_parser_class = get_parser_class_for_mime_type('text/plain')
        self.assertIsInstance(
            text_parser_class(logging_group=None),
            TextDocumentParser,
        )
        self.assertIsNone(get_parser_class_for_mime_type('text/sdgsdf'))

        self.assertTrue(is_file_ext_supported('.pdf'))
        self.assertFalse(is_file_ext_supported('.hsdfh'))
        self.assertFalse(is_file_ext_supported(''))