2023-05-12 14:21:32 -07:00
|
|
|
|
from pathlib import Path
|
2020-12-15 13:26:01 +01:00
|
|
|
|
|
|
|
|
|
|
from django.test import TestCase
|
2023-04-20 08:10:17 -07:00
|
|
|
|
|
2020-12-15 13:26:01 +01:00
|
|
|
|
from documents.tests.utils import DirectoriesMixin
|
2023-02-19 18:00:45 -08:00
|
|
|
|
from documents.tests.utils import FileSystemAssertsMixin
|
2020-12-15 13:26:01 +01:00
|
|
|
|
from paperless_text.parsers import TextDocumentParser
|
|
|
|
|
|
|
|
|
|
|
|
|
2023-02-19 18:00:45 -08:00
|
|
|
|
class TestTextParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
    """Exercise ``TextDocumentParser`` against the files in the ``samples`` directory."""

    # Directory with the sample documents, resolved relative to this test module.
    SAMPLE_DIR = Path(__file__).resolve().parent / "samples"

    def test_thumbnail(self):
        """
        GIVEN:
            - The plain text sample file
        WHEN:
            - A thumbnail is requested for it
        THEN:
            - The call completes and the returned value passes ``assertIsFile``
        """
        parser = TextDocumentParser(None)

        # just make sure that it does not crash
        f = parser.get_thumbnail(
            self.SAMPLE_DIR / "test.txt",
            "text/plain",
        )
        self.assertIsFile(f)

    def test_parse(self):
        """
        GIVEN:
            - The plain text sample file
        WHEN:
            - The file is parsed
        THEN:
            - The extracted text equals the expected content
            - No archive path is reported
        """
        parser = TextDocumentParser(None)

        parser.parse(
            self.SAMPLE_DIR / "test.txt",
            "text/plain",
        )

        self.assertEqual(parser.get_text(), "This is a test file.\n")
        self.assertIsNone(parser.get_archive_path())

    def test_parse_invalid_bytes(self):
        """
        GIVEN:
            - Text file which contains invalid UTF bytes
        WHEN:
            - The file is parsed
        THEN:
            - Parsing continues
            - Invalid bytes are replaced with U+FFFD (the Unicode replacement character)
        """
        parser = TextDocumentParser(None)

        parser.parse(
            self.SAMPLE_DIR / "decode_error.txt",
            "text/plain",
        )

        # The expected text is written with an explicit escape so the
        # replacement character cannot be mangled by editors or tooling that
        # re-encode this source file.
        # NOTE(review): assumes the parser substitutes U+FFFD for undecodable
        # bytes - confirm against the parser and samples/decode_error.txt.
        self.assertEqual(parser.get_text(), "Pantothens\ufffdure\n")
        self.assertIsNone(parser.get_archive_path())
|