paperless-ngx/src/documents/tests/test_consumer.py

379 lines
13 KiB
Python
Raw Normal View History

import os
from unittest import mock, skipIf
import pyocr
2016-02-11 12:25:23 +00:00
from django.test import TestCase
from pyocr.libtesseract.tesseract_raw import \
TesseractError as OtherTesseractError
2016-02-11 12:25:23 +00:00
from ..models import FileInfo
from ..consumer import image_to_string, strip_excess_whitespace
2016-02-11 12:25:23 +00:00
class TestAttributes(TestCase):
    """
    Checks the correspondent, title, tags and extension that
    ``FileInfo.from_path`` reports for hand-written file names.
    """

    TAGS = ("tag1", "tag2", "tag3")

    # Every extension is tried in lower, upper and mixed case.
    EXTENSIONS = (
        "pdf", "png", "jpg", "jpeg", "gif",
        "PDF", "PNG", "JPG", "JPEG", "GIF",
        "PdF", "PnG", "JpG", "JPeG", "GiF",
    )

    def _test_guess_attributes_from_name(self, path, sender, title, tags):
        """
        Format ``path`` with each entry of ``EXTENSIONS`` and verify the
        resulting ``FileInfo``.

        ``path`` must contain a single ``{}`` placeholder for the extension.
        A falsy ``sender`` means no correspondent is expected.  ``tags`` is
        the expected tuple of tag slugs.  The formatted file name is passed
        as the message of every assertion so a failure identifies the input.
        """
        for ext in self.EXTENSIONS:
            filename = path.format(ext)
            info = FileInfo.from_path(filename)

            if not sender:
                self.assertIsNone(info.correspondent, filename)
            else:
                self.assertEqual(info.correspondent.name, sender, filename)

            self.assertEqual(info.title, title, filename)

            slugs = tuple(tag.slug for tag in info.tags)
            self.assertEqual(slugs, tags, filename)

            # The extension is expected lower-cased, with "jpeg" reported
            # as "jpg".
            lowered = ext.lower()
            expected_extension = "jpg" if lowered == "jpeg" else lowered
            self.assertEqual(info.extension, expected_extension, filename)

    def test_guess_attributes_from_name0(self):
        self._test_guess_attributes_from_name(
            path="/path/to/Sender - Title.{}",
            sender="Sender",
            title="Title",
            tags=(),
        )

    def test_guess_attributes_from_name1(self):
        self._test_guess_attributes_from_name(
            path="/path/to/Spaced Sender - Title.{}",
            sender="Spaced Sender",
            title="Title",
            tags=(),
        )

    def test_guess_attributes_from_name2(self):
        self._test_guess_attributes_from_name(
            path="/path/to/Sender - Spaced Title.{}",
            sender="Sender",
            title="Spaced Title",
            tags=(),
        )

    def test_guess_attributes_from_name3(self):
        self._test_guess_attributes_from_name(
            path="/path/to/Dashed-Sender - Title.{}",
            sender="Dashed-Sender",
            title="Title",
            tags=(),
        )

    def test_guess_attributes_from_name4(self):
        self._test_guess_attributes_from_name(
            path="/path/to/Sender - Dashed-Title.{}",
            sender="Sender",
            title="Dashed-Title",
            tags=(),
        )

    def test_guess_attributes_from_name5(self):
        self._test_guess_attributes_from_name(
            path="/path/to/Sender - Title - tag1,tag2,tag3.{}",
            sender="Sender",
            title="Title",
            tags=self.TAGS,
        )

    def test_guess_attributes_from_name6(self):
        self._test_guess_attributes_from_name(
            path="/path/to/Spaced Sender - Title - tag1,tag2,tag3.{}",
            sender="Spaced Sender",
            title="Title",
            tags=self.TAGS,
        )

    def test_guess_attributes_from_name7(self):
        self._test_guess_attributes_from_name(
            path="/path/to/Sender - Spaced Title - tag1,tag2,tag3.{}",
            sender="Sender",
            title="Spaced Title",
            tags=self.TAGS,
        )

    def test_guess_attributes_from_name8(self):
        self._test_guess_attributes_from_name(
            path="/path/to/Dashed-Sender - Title - tag1,tag2,tag3.{}",
            sender="Dashed-Sender",
            title="Title",
            tags=self.TAGS,
        )

    def test_guess_attributes_from_name9(self):
        self._test_guess_attributes_from_name(
            path="/path/to/Sender - Dashed-Title - tag1,tag2,tag3.{}",
            sender="Sender",
            title="Dashed-Title",
            tags=self.TAGS,
        )

    def test_guess_attributes_from_name10(self):
        self._test_guess_attributes_from_name(
            path="/path/to/Σενδερ - Τιτλε - tag1,tag2,tag3.{}",
            sender="Σενδερ",
            title="Τιτλε",
            tags=self.TAGS,
        )

    def test_guess_attributes_from_name_when_correspondent_empty(self):
        self._test_guess_attributes_from_name(
            path="/path/to/ - weird empty correspondent but should not break.{}",
            sender=None,
            title="weird empty correspondent but should not break",
            tags=(),
        )

    def test_guess_attributes_from_name_when_title_starts_with_dash(self):
        self._test_guess_attributes_from_name(
            path="/path/to/- weird but should not break.{}",
            sender=None,
            title="- weird but should not break",
            tags=(),
        )

    def test_guess_attributes_from_name_when_title_ends_with_dash(self):
        self._test_guess_attributes_from_name(
            path="/path/to/weird but should not break -.{}",
            sender=None,
            title="weird but should not break -",
            tags=(),
        )

    def test_guess_attributes_from_name_when_title_is_empty(self):
        self._test_guess_attributes_from_name(
            path="/path/to/weird correspondent but should not break - .{}",
            sender="weird correspondent but should not break",
            title="",
            tags=(),
        )
2016-10-26 09:32:59 +00:00
class TestFieldPermutations(TestCase):
    """
    Runs ``FileInfo.from_path`` over permutations of the file-name fields
    (created date, correspondent, title, tags, extension) and checks that
    each field is reported back as it was written into the name.
    """

    valid_dates = (
        "20150102030405Z",
        "20150102Z",
    )
    valid_correspondents = [
        "timmy",
        "Dr. McWheelie",
        "Dash Gor-don",
        "ο Θερμαστής",
        ""
    ]
    valid_titles = ["title", "Title w Spaces", "Title a-dash", "Τίτλος", ""]
    valid_tags = ["tag", "tig,tag", "tag1,tag2,tag-3"]
    valid_extensions = ["pdf", "png", "jpg", "jpeg", "gif"]

    def _test_guessed_attributes(self, filename, created=None,
                                 correspondent=None, title=None,
                                 extension=None, tags=None):
        """
        Parse ``filename`` with ``FileInfo.from_path`` and compare each
        attribute with the expected value.

        ``created`` is the date string used in the file name, or ``None``
        when no date is expected.  A falsy ``correspondent`` means no
        correspondent is expected.  ``tags`` is the comma-separated tag
        string used in the file name, or ``None`` when no tags are expected.
        ``filename`` is passed as the message of every assertion so a
        failure identifies the offending permutation.
        """
        info = FileInfo.from_path(filename)

        # Created: only the calendar date (first eight digits) is compared.
        if created is None:
            self.assertIsNone(info.created, filename)
        else:
            self.assertEqual(info.created.year, int(created[:4]), filename)
            self.assertEqual(info.created.month, int(created[4:6]), filename)
            self.assertEqual(info.created.day, int(created[6:8]), filename)

        # Correspondent: an empty name is treated like a missing one.
        if correspondent:
            self.assertEqual(info.correspondent.name, correspondent, filename)
        else:
            self.assertIsNone(info.correspondent, filename)

        # Title
        self.assertEqual(info.title, title, filename)

        # Tags
        if tags is None:
            self.assertEqual(info.tags, (), filename)
        else:
            self.assertEqual(
                [t.slug for t in info.tags], tags.split(','),
                filename
            )

        # Extension: "jpeg" is expected to be reported as "jpg".
        if extension == 'jpeg':
            extension = 'jpg'
        self.assertEqual(info.extension, extension, filename)

    def test_just_title(self):
        template = '/path/to/{title}.{extension}'
        for title in self.valid_titles:
            for extension in self.valid_extensions:
                spec = dict(title=title, extension=extension)
                filename = template.format(**spec)
                self._test_guessed_attributes(filename, **spec)

    def test_title_and_correspondent(self):
        template = '/path/to/{correspondent} - {title}.{extension}'
        for correspondent in self.valid_correspondents:
            for title in self.valid_titles:
                for extension in self.valid_extensions:
                    spec = dict(correspondent=correspondent, title=title,
                                extension=extension)
                    filename = template.format(**spec)
                    self._test_guessed_attributes(filename, **spec)

    def test_title_and_correspondent_and_tags(self):
        template = '/path/to/{correspondent} - {title} - {tags}.{extension}'
        for correspondent in self.valid_correspondents:
            for title in self.valid_titles:
                for tags in self.valid_tags:
                    for extension in self.valid_extensions:
                        spec = dict(correspondent=correspondent, title=title,
                                    tags=tags, extension=extension)
                        filename = template.format(**spec)
                        self._test_guessed_attributes(filename, **spec)

    def test_created_and_correspondent_and_title_and_tags(self):
        template = ("/path/to/{created} - "
                    "{correspondent} - "
                    "{title} - "
                    "{tags}"
                    ".{extension}")
        for created in self.valid_dates:
            for correspondent in self.valid_correspondents:
                for title in self.valid_titles:
                    for tags in self.valid_tags:
                        for extension in self.valid_extensions:
                            spec = {
                                "created": created,
                                "correspondent": correspondent,
                                "title": title,
                                "tags": tags,
                                "extension": extension
                            }
                            self._test_guessed_attributes(
                                template.format(**spec), **spec)

    def test_created_and_correspondent_and_title(self):
        template = ("/path/to/{created} - "
                    "{correspondent} - "
                    "{title}"
                    ".{extension}")
        for created in self.valid_dates:
            for correspondent in self.valid_correspondents:
                for title in self.valid_titles:
                    # Skip cases where title looks like a tag as we can't
                    # accommodate such cases.  "Looks like a tag" here means
                    # the title is unchanged by lower-casing, which also
                    # covers the empty title.
                    if title.lower() == title:
                        continue
                    for extension in self.valid_extensions:
                        spec = {
                            "created": created,
                            "correspondent": correspondent,
                            "title": title,
                            "extension": extension
                        }
                        self._test_guessed_attributes(
                            template.format(**spec), **spec)

    def test_created_and_title(self):
        template = ("/path/to/{created} - "
                    "{title}"
                    ".{extension}")
        for created in self.valid_dates:
            for title in self.valid_titles:
                for extension in self.valid_extensions:
                    spec = {
                        "created": created,
                        "title": title,
                        "extension": extension
                    }
                    self._test_guessed_attributes(
                        template.format(**spec), **spec)

    def test_created_and_title_and_tags(self):
        template = ("/path/to/{created} - "
                    "{title} - "
                    "{tags}"
                    ".{extension}")
        for created in self.valid_dates:
            for title in self.valid_titles:
                for tags in self.valid_tags:
                    for extension in self.valid_extensions:
                        spec = {
                            "created": created,
                            "title": title,
                            "tags": tags,
                            "extension": extension
                        }
                        self._test_guessed_attributes(
                            template.format(**spec), **spec)
2016-10-12 01:46:34 +02:00
class FakeTesseract:
    """Stand-in for a pyocr tool, used so tests need no real OCR engine."""

    @staticmethod
    def can_detect_orientation():
        """Report that orientation detection is supported."""
        return True

    @staticmethod
    def detect_orientation(file_handle, lang):
        """Always fail by raising the pyocr ``TesseractError``."""
        raise OtherTesseractError("arbitrary status", "message")

    @staticmethod
    def image_to_string(file_handle, lang):
        """Return a fixed piece of text regardless of the arguments."""
        return "This is test text"
class FakePyOcr:
    """Stand-in for the ``pyocr`` module that offers only ``FakeTesseract``."""

    @staticmethod
    def get_available_tools():
        """Return a one-element list holding the ``FakeTesseract`` class."""
        return [FakeTesseract]
2016-10-12 01:46:34 +02:00
class TestOCR(TestCase):
    """Tests for ``strip_excess_whitespace`` and ``image_to_string``."""

    # (input, expected output) pairs for strip_excess_whitespace.
    text_cases = [
        ("simple string", "simple string"),
        (
            "simple newline\n testing string",
            "simple newline\ntesting string"
        ),
        (
            "utf-8 строка с пробелами в конце ",
            "utf-8 строка с пробелами в конце"
        )
    ]

    # Directory of sample files that sits next to this test module.
    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
    # Evaluated once when the class is created; True when pyocr reports at
    # least one available tool.
    TESSERACT_INSTALLED = bool(pyocr.get_available_tools())

    def test_strip_excess_whitespace(self):
        """Every entry in ``text_cases`` must be normalised as expected."""
        for source, result in self.text_cases:
            actual_result = strip_excess_whitespace(source)
            self.assertEqual(
                result,
                actual_result,
                "strip_excess_whitespace({}) != '{}', but '{}'".format(
                    source,
                    result,
                    actual_result
                )
            )

    @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
    @mock.patch("documents.consumer.Consumer.SCRATCH", SAMPLE_FILES)
    @mock.patch("documents.consumer.pyocr", FakePyOcr)
    def test_image_to_string_with_text_free_page(self):
        """
        This test is sort of silly, since it's really just reproducing an odd
        exception thrown by pyocr when it encounters a page with no text.
        Actually running this test against an installation of Tesseract results
        in a segmentation fault rooted somewhere deep inside pyocr where I
        don't care to dig.  Regardless, if you run the consumer normally,
        text-free pages are now handled correctly so long as we work around
        this weird exception.

        The test makes no assertion of its own: it passes as long as the
        call below does not raise.
        """
        image_to_string(["text.png", "en"])