2020-11-20 13:31:03 +01:00
|
|
|
import os
|
2020-11-25 21:38:19 +01:00
|
|
|
import shutil
|
|
|
|
|
import tempfile
|
2020-11-08 13:49:15 +01:00
|
|
|
from tempfile import TemporaryDirectory
|
|
|
|
|
from unittest import mock
|
|
|
|
|
|
2022-03-11 10:55:51 -08:00
|
|
|
from django.test import override_settings
|
|
|
|
|
from django.test import TestCase
|
|
|
|
|
from documents.parsers import DocumentParser
|
|
|
|
|
from documents.parsers import get_default_file_extension
|
|
|
|
|
from documents.parsers import get_parser_class
|
|
|
|
|
from documents.parsers import get_parser_class_for_mime_type
|
|
|
|
|
from documents.parsers import get_supported_file_extensions
|
|
|
|
|
from documents.parsers import is_file_ext_supported
|
2020-11-30 00:40:04 +01:00
|
|
|
from paperless_tesseract.parsers import RasterisedDocumentParser
|
|
|
|
|
from paperless_text.parsers import TextDocumentParser
|
2020-11-08 13:49:15 +01:00
|
|
|
|
2020-11-21 12:12:19 +01:00
|
|
|
|
2020-11-20 13:31:03 +01:00
|
|
|
def fake_magic_from_file(file, mime=False):
    """Stand-in for ``magic.from_file`` that only looks at the file name.

    When ``mime`` is falsy a fixed descriptive sentence is returned. When
    ``mime`` is truthy the answer is derived from the extension of ``file``
    alone: ``"application/pdf"`` for ``.pdf`` (exact, case-sensitive match)
    and ``"unknown"`` for everything else.
    """
    if not mime:
        return "A verbose string that describes the contents of the file"

    _, extension = os.path.splitext(file)
    return "application/pdf" if extension == ".pdf" else "unknown"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@mock.patch("documents.parsers.magic.from_file", fake_magic_from_file)
class TestParserDiscovery(TestCase):
    """Checks which parser ``get_parser_class`` selects for ``doc.pdf``.

    ``magic.from_file`` is replaced by ``fake_magic_from_file`` for every test
    in this class, and each test mocks ``document_consumer_declaration.send``
    to control which parser declarations are visible.
    """

    @staticmethod
    def _pdf_declaration(parser, weight):
        """Build one mocked ``send`` result entry declaring a PDF parser.

        The first tuple element is ``None`` in every test of this class.
        NOTE(review): presumably the entries mirror the ``(receiver, response)``
        pairs returned by a Django signal's ``send`` -- confirm.
        """
        return (
            None,
            {
                "weight": weight,
                "parser": parser,
                "mime_types": {"application/pdf": ".pdf"},
            },
        )

    @mock.patch("documents.parsers.document_consumer_declaration.send")
    def test__get_parser_class_1_parser(self, m, *args):
        """A single declared PDF parser is the one returned."""

        class DummyParser:
            pass

        m.return_value = (self._pdf_declaration(DummyParser, weight=0),)

        self.assertEqual(get_parser_class("doc.pdf"), DummyParser)

    @mock.patch("documents.parsers.document_consumer_declaration.send")
    def test__get_parser_class_n_parsers(self, m, *args):
        """With two PDF parsers declared, the weight-1 parser is expected
        to be chosen over the weight-0 parser."""

        class DummyParser1:
            pass

        class DummyParser2:
            pass

        m.return_value = (
            self._pdf_declaration(DummyParser1, weight=0),
            self._pdf_declaration(DummyParser2, weight=1),
        )

        self.assertEqual(get_parser_class("doc.pdf"), DummyParser2)

    @mock.patch("documents.parsers.document_consumer_declaration.send")
    def test__get_parser_class_0_parsers(self, m, *args):
        """With no declarations at all, ``None`` is expected."""
        m.return_value = []

        # No scratch directory is set up here: the test passes a bare file
        # name and the MIME lookup is served by the patched magic.from_file.
        self.assertIsNone(get_parser_class("doc.pdf"))
|
2020-11-25 21:38:19 +01:00
|
|
|
|
|
|
|
|
|
2021-02-09 22:12:43 +01:00
|
|
|
def fake_get_thumbnail(self, path, mimetype, file_name):
    """Return the path ``examples/no-text.png`` next to this module.

    Every argument is ignored; the same path is produced on each call.
    """
    module_dir = os.path.dirname(__file__)
    return os.path.join(module_dir, "examples", "no-text.png")
|
|
|
|
|
|
|
|
|
|
|
2020-11-30 00:40:04 +01:00
|
|
|
class TestParserAvailability(TestCase):
    """Checks the extension and MIME-type lookups in ``documents.parsers``.

    Unlike the discovery tests, nothing is patched in this class, so the
    assertions run against whatever parsers the project actually declares.
    """

    def test_file_extensions(self):
        """Verify supported extensions, default extensions per MIME type,
        the parser class chosen per MIME type, and extension support checks."""
        # Each extension gets its own sub-test so one missing extension does
        # not hide the status of the others.
        for ext in [".pdf", ".jpe", ".jpg", ".jpeg", ".txt", ".csv"]:
            with self.subTest(ext=ext):
                self.assertIn(ext, get_supported_file_extensions())

        self.assertEqual(get_default_file_extension("application/pdf"), ".pdf")
        self.assertEqual(get_default_file_extension("image/png"), ".png")
        self.assertEqual(get_default_file_extension("image/jpeg"), ".jpg")
        self.assertEqual(get_default_file_extension("text/plain"), ".txt")
        self.assertEqual(get_default_file_extension("text/csv"), ".csv")
        self.assertEqual(get_default_file_extension("application/zip"), ".zip")
        # An unrecognised MIME type is expected to map to an empty extension.
        self.assertEqual(get_default_file_extension("aasdasd/dgfgf"), "")

        # The lookup returns a class; instantiate it to check its type.
        self.assertIsInstance(
            get_parser_class_for_mime_type("application/pdf")(logging_group=None),
            RasterisedDocumentParser,
        )
        self.assertIsInstance(
            get_parser_class_for_mime_type("text/plain")(logging_group=None),
            TextDocumentParser,
        )
        # Identity check against None, matching the assertIsNone style used
        # elsewhere in this module.
        self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))

        self.assertTrue(is_file_ext_supported(".pdf"))
        self.assertFalse(is_file_ext_supported(".hsdfh"))
        self.assertFalse(is_file_ext_supported(""))
|