paperless-ngx/src/paperless_text/tests/test_parser.py

53 lines
1.4 KiB
Python
Raw Normal View History

from pathlib import Path
2020-12-15 13:26:01 +01:00
from django.test import TestCase
2020-12-15 13:26:01 +01:00
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
2020-12-15 13:26:01 +01:00
from paperless_text.parsers import TextDocumentParser
class TestTextParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
    """Tests for ``TextDocumentParser`` run against the bundled sample files."""

    # Directory holding the sample documents; it sits next to this test module.
    SAMPLE_DIR = Path(__file__).resolve().parent / "samples"

    def test_thumbnail(self):
        """
        GIVEN:
            - A plain text sample file
        WHEN:
            - A thumbnail is requested for the file
        THEN:
            - The call does not crash
            - The returned path passes the ``assertIsFile`` check
        """
        parser = TextDocumentParser(None)

        # just make sure that it does not crash
        f = parser.get_thumbnail(
            self.SAMPLE_DIR / "test.txt",
            "text/plain",
        )
        self.assertIsFile(f)

    def test_parse(self):
        """
        GIVEN:
            - A plain text sample file
        WHEN:
            - The file is parsed
        THEN:
            - The extracted text equals the file's content
            - No archive path is reported
        """
        parser = TextDocumentParser(None)

        parser.parse(
            self.SAMPLE_DIR / "test.txt",
            "text/plain",
        )

        self.assertEqual(parser.get_text(), "This is a test file.\n")
        self.assertIsNone(parser.get_archive_path())

    def test_parse_invalid_bytes(self):
        """
        GIVEN:
            - Text file which contains invalid UTF bytes
        WHEN:
            - The file is parsed
        THEN:
            - Parsing continues
            - Invalid bytes are replaced by U+FFFD (REPLACEMENT CHARACTER)
        """
        parser = TextDocumentParser(None)

        parser.parse(
            self.SAMPLE_DIR / "decode_error.txt",
            "text/plain",
        )

        # The undecodable byte is expected to surface as U+FFFD. It is written
        # as an escape so the expectation cannot be mangled into the literal
        # ASCII text "<EFBFBD>" (the UTF-8 bytes of U+FFFD) when this file is
        # re-encoded or copied through a tool that is not Unicode-clean.
        self.assertEqual(parser.get_text(), "Pantothens\ufffdure\n")
        self.assertIsNone(parser.get_archive_path())