2025-08-01 13:26:35 -04:00
|
|
|
|
import tempfile
|
2023-05-12 14:21:32 -07:00
|
|
|
|
from pathlib import Path
|
2020-12-15 13:26:01 +01:00
|
|
|
|
|
|
|
|
|
|
from paperless_text.parsers import TextDocumentParser
|
|
|
|
|
|
|
|
|
|
|
|
|
2024-07-08 07:46:20 -07:00
|
|
|
|
class TestTextParser:
    """Tests for ``TextDocumentParser`` thumbnail generation and text parsing.

    The ``text_parser``, ``sample_txt_file`` and ``malformed_txt_file``
    arguments are not defined in this class; presumably they are pytest
    fixtures provided elsewhere (e.g. a conftest) -- TODO confirm.
    """

    def test_thumbnail(self, text_parser: TextDocumentParser, sample_txt_file: Path):
        """
        GIVEN:
            - A sample plain text file
        WHEN:
            - A thumbnail is requested
        THEN:
            - The returned path exists and is a regular file
        """
        # just make sure that it does not crash
        f = text_parser.get_thumbnail(sample_txt_file, "text/plain")
        assert f.exists()
        assert f.is_file()

    def test_parse(self, text_parser: TextDocumentParser, sample_txt_file: Path):
        """
        GIVEN:
            - A sample plain text file
        WHEN:
            - The file is parsed
        THEN:
            - The extracted text equals the expected content
            - No archive path is reported
        """
        text_parser.parse(sample_txt_file, "text/plain")

        assert text_parser.get_text() == "This is a test file.\n"
        assert text_parser.get_archive_path() is None

    def test_parse_invalid_bytes(
        self,
        text_parser: TextDocumentParser,
        malformed_txt_file: Path,
    ):
        """
        GIVEN:
            - Text file which contains invalid UTF bytes
        WHEN:
            - The file is parsed
        THEN:
            - Parsing continues
            - Invalid bytes are removed
        """
        text_parser.parse(malformed_txt_file, "text/plain")

        # NOTE(review): "<EFBFBD>" looks like a hex rendering of U+FFFD
        # (REPLACEMENT CHARACTER) rather than literal text; the literal is
        # kept unchanged here -- confirm against the parser's real output.
        assert text_parser.get_text() == "Pantothens<EFBFBD>ure\n"
        assert text_parser.get_archive_path() is None

    def test_thumbnail_large_file(self, text_parser: TextDocumentParser):
        """
        GIVEN:
            - A very large text file (>50MB)
        WHEN:
            - A thumbnail is requested
        THEN:
            - A thumbnail is created without reading the entire file into memory
        """
        # NOTE: this test only asserts that a thumbnail file is produced;
        # memory consumption itself is not measured here.
        large_file = None
        try:
            # delete=False so the file survives the ``with`` block: it has to
            # be closed (and therefore flushed) before it is handed to the
            # parser by path.
            with tempfile.NamedTemporaryFile(
                delete=False,
                mode="w",
                encoding="utf-8",
                suffix=".txt",
            ) as tmp:
                large_file = Path(tmp.name)
                tmp.write("A" * (51 * 1024 * 1024))  # 51 MB of 'A'

            thumb = text_parser.get_thumbnail(large_file, "text/plain")
            assert thumb.exists()
            assert thumb.is_file()
        finally:
            # Always remove the 51 MB file, even when thumbnail creation or
            # an assertion fails, so failed runs do not leak disk space.
            if large_file is not None:
                large_file.unlink()
|