2023-05-12 14:21:32 -07:00
|
|
|
|
from pathlib import Path
|
2020-12-15 13:26:01 +01:00
|
|
|
|
|
|
|
|
|
|
from paperless_text.parsers import TextDocumentParser
|
|
|
|
|
|
|
|
|
|
|
|
|
2024-07-08 07:46:20 -07:00
|
|
|
|
class TestTextParser:
    """Tests for ``TextDocumentParser`` when handling ``text/plain`` input.

    The ``text_parser``, ``sample_txt_file`` and ``malformed_txt_file``
    arguments are pytest fixtures supplied from outside this class.
    """

    def test_thumbnail(
        self,
        text_parser: TextDocumentParser,
        sample_txt_file: Path,
    ) -> None:
        """Request a thumbnail and check the result is an existing regular file."""
        # just make sure that it does not crash
        thumbnail = text_parser.get_thumbnail(sample_txt_file, "text/plain")

        assert thumbnail.exists()
        assert thumbnail.is_file()

    def test_parse(
        self,
        text_parser: TextDocumentParser,
        sample_txt_file: Path,
    ) -> None:
        """Parse the sample file and check the extracted text and archive path."""
        text_parser.parse(sample_txt_file, "text/plain")

        assert text_parser.get_text() == "This is a test file.\n"
        # No archive version is expected for this input.
        assert text_parser.get_archive_path() is None

    def test_parse_invalid_bytes(
        self,
        text_parser: TextDocumentParser,
        malformed_txt_file: Path,
    ) -> None:
        """
        GIVEN:
            - Text file which contains invalid UTF bytes
        WHEN:
            - The file is parsed
        THEN:
            - Parsing continues
            - Invalid bytes are replaced with U+FFFD (the Unicode
              replacement character)
        """
        text_parser.parse(malformed_txt_file, "text/plain")

        # The expected text contains a single U+FFFD where the undecodable
        # byte was.  It is written as an escape so the literal cannot be
        # damaged by editors or tools that re-encode the file.
        assert text_parser.get_text() == "Pantothens\ufffdure\n"
        assert text_parser.get_archive_path() is None
|