2020-11-08 13:49:15 +01:00
|
|
|
from tempfile import TemporaryDirectory
|
|
|
|
|
from unittest import mock
|
|
|
|
|
|
2022-03-11 10:55:51 -08:00
|
|
|
from django.test import TestCase
|
|
|
|
|
from documents.parsers import get_default_file_extension
|
|
|
|
|
from documents.parsers import get_parser_class_for_mime_type
|
|
|
|
|
from documents.parsers import get_supported_file_extensions
|
|
|
|
|
from documents.parsers import is_file_ext_supported
|
2020-11-30 00:40:04 +01:00
|
|
|
from paperless_tesseract.parsers import RasterisedDocumentParser
|
|
|
|
|
from paperless_text.parsers import TextDocumentParser
|
2020-11-08 13:49:15 +01:00
|
|
|
|
2020-11-21 12:12:19 +01:00
|
|
|
|
2020-11-08 13:49:15 +01:00
|
|
|
class TestParserDiscovery(TestCase):
    """
    Tests for get_parser_class_for_mime_type.

    Every test patches documents.parsers.document_consumer_declaration.send
    and sets its return value, so the parser declarations seen by the lookup
    are exactly the ones the test provides.
    """

    @mock.patch("documents.parsers.document_consumer_declaration.send")
    def test_get_parser_class_1_parser(self, m, *args):
        """
        GIVEN:
            - Parser declared for a given mimetype
        WHEN:
            - Attempt to get parser for the mimetype
        THEN:
            - Declared parser class is returned
        """

        class DummyParser:
            pass

        # One (None, declaration) pair: a single parser handling PDFs
        m.return_value = (
            (
                None,
                {
                    "weight": 0,
                    "parser": DummyParser,
                    "mime_types": {"application/pdf": ".pdf"},
                },
            ),
        )

        self.assertEqual(get_parser_class_for_mime_type("application/pdf"), DummyParser)

    @mock.patch("documents.parsers.document_consumer_declaration.send")
    def test_get_parser_class_n_parsers(self, m, *args):
        """
        GIVEN:
            - Two parsers declared for a given mimetype
            - Second parser has a higher weight
        WHEN:
            - Attempt to get parser for the mimetype
        THEN:
            - Second parser class is returned
        """

        class DummyParser1:
            pass

        class DummyParser2:
            pass

        # Both declarations cover the same mimetype and differ only in weight
        m.return_value = (
            (
                None,
                {
                    "weight": 0,
                    "parser": DummyParser1,
                    "mime_types": {"application/pdf": ".pdf"},
                },
            ),
            (
                None,
                {
                    "weight": 1,
                    "parser": DummyParser2,
                    "mime_types": {"application/pdf": ".pdf"},
                },
            ),
        )

        self.assertEqual(
            get_parser_class_for_mime_type("application/pdf"),
            DummyParser2,
        )

    @mock.patch("documents.parsers.document_consumer_declaration.send")
    def test_get_parser_class_0_parsers(self, m, *args):
        """
        GIVEN:
            - No parsers are declared
        WHEN:
            - Attempt to get parser for the mimetype
        THEN:
            - No parser class is returned
        """
        m.return_value = []

        self.assertIsNone(get_parser_class_for_mime_type("application/pdf"))

    @mock.patch("documents.parsers.document_consumer_declaration.send")
    def test_get_parser_class_no_valid_parser(self, m, *args):
        """
        GIVEN:
            - No parser declared for a given mimetype
            - Parser declared for a different mimetype
        WHEN:
            - Attempt to get parser for the given mimetype
        THEN:
            - No parser class is returned
        """

        class DummyParser:
            pass

        # The only declared parser handles PDFs, not the TIFF requested below
        m.return_value = (
            (
                None,
                {
                    "weight": 0,
                    "parser": DummyParser,
                    "mime_types": {"application/pdf": ".pdf"},
                },
            ),
        )

        self.assertIsNone(get_parser_class_for_mime_type("image/tiff"))
|
2020-11-25 21:38:19 +01:00
|
|
|
|
|
|
|
|
|
2020-11-30 00:40:04 +01:00
|
|
|
class TestParserAvailability(TestCase):
    """
    Tests the extension and parser lookup helpers from documents.parsers.

    Nothing is patched in this class, so the expectations below depend on
    the parsers available in the test environment.
    """

    def test_file_extensions(self):
        """
        GIVEN:
            - A list of mimetypes and the file extension expected for each
        WHEN:
            - Supported extensions, default extensions and parser classes
              are looked up
        THEN:
            - Each expected extension is supported and is the default for
              its mimetype
            - Mimetypes without a parser resolve to no parser class
            - PDF and plain text resolve to the expected parser types
        """
        supported_mimes_and_exts = [
            ("application/pdf", ".pdf"),
            ("image/png", ".png"),
            ("image/jpeg", ".jpg"),
            ("image/tiff", ".tif"),
            ("image/webp", ".webp"),
            ("text/plain", ".txt"),
            ("text/csv", ".csv"),
        ]

        supported_exts = get_supported_file_extensions()

        for mime_type, ext in supported_mimes_and_exts:
            # subTest labels a failure with the offending pair and lets the
            # remaining pairs still be checked
            with self.subTest(mime_type=mime_type, ext=ext):
                self.assertIn(ext, supported_exts)
                self.assertEqual(get_default_file_extension(mime_type), ext)

        # Test no parser declared still returns an extension
        self.assertEqual(get_default_file_extension("application/zip"), ".zip")

        # Test invalid mimetype returns no extension
        self.assertEqual(get_default_file_extension("aasdasd/dgfgf"), "")

        self.assertIsInstance(
            get_parser_class_for_mime_type("application/pdf")(logging_group=None),
            RasterisedDocumentParser,
        )
        self.assertIsInstance(
            get_parser_class_for_mime_type("text/plain")(logging_group=None),
            TextDocumentParser,
        )
        self.assertIsNone(get_parser_class_for_mime_type("text/sdgsdf"))

        self.assertTrue(is_file_ext_supported(".pdf"))
        self.assertFalse(is_file_ext_supported(".hsdfh"))
        self.assertFalse(is_file_ext_supported(""))
|