paperless-ngx/src/documents/tests/test_consumer.py

import itertools
import os
from unittest import mock, skipIf

import pyocr
from django.test import TestCase
from pyocr.libtesseract.tesseract_raw import \
    TesseractError as OtherTesseractError

from ..models import FileInfo
from ..consumer import image_to_string, strip_excess_whitespace


class TestAttributes(TestCase):
    """
    Checks the correspondent, title, tags and extension that
    ``FileInfo.from_path`` derives from a document's file name.
    """

    TAGS = ("tag1", "tag2", "tag3")
    EXTENSIONS = (
        "pdf", "png", "jpg", "jpeg", "gif",
        "PDF", "PNG", "JPG", "JPEG", "GIF",
        "PdF", "PnG", "JpG", "JPeG", "GiF",
    )

    def _test_guess_attributes_from_name(self, path, sender, title, tags):
        """
        Format ``path`` with every entry of ``EXTENSIONS`` and verify what
        ``FileInfo.from_path`` reports for the resulting file name.

        :param path: file name template with a single ``{}`` placeholder
            for the extension
        :param sender: expected correspondent name; a falsy value means no
            correspondent is expected at all
        :param title: expected title
        :param tags: expected tuple of tag slugs
        """
        for ext in self.EXTENSIONS:

            name = path.format(ext)
            guessed = FileInfo.from_path(name)

            if not sender:
                self.assertIsNone(guessed.correspondent, name)
            else:
                self.assertEqual(guessed.correspondent.name, sender, name)

            self.assertEqual(guessed.title, title, name)

            slugs = tuple(tag.slug for tag in guessed.tags)
            self.assertEqual(slugs, tags, name)

            # Extensions are expected back lower-cased, with "jpeg" reported
            # as "jpg".
            lowered = ext.lower()
            expected_ext = "jpg" if lowered == "jpeg" else lowered
            self.assertEqual(guessed.extension, expected_ext, name)

    def test_guess_attributes_from_name0(self):
        self._test_guess_attributes_from_name(
            "/path/to/Sender - Title.{}",
            "Sender",
            "Title",
            (),
        )

    def test_guess_attributes_from_name1(self):
        self._test_guess_attributes_from_name(
            "/path/to/Spaced Sender - Title.{}",
            "Spaced Sender",
            "Title",
            (),
        )

    def test_guess_attributes_from_name2(self):
        self._test_guess_attributes_from_name(
            "/path/to/Sender - Spaced Title.{}",
            "Sender",
            "Spaced Title",
            (),
        )

    def test_guess_attributes_from_name3(self):
        self._test_guess_attributes_from_name(
            "/path/to/Dashed-Sender - Title.{}",
            "Dashed-Sender",
            "Title",
            (),
        )

    def test_guess_attributes_from_name4(self):
        self._test_guess_attributes_from_name(
            "/path/to/Sender - Dashed-Title.{}",
            "Sender",
            "Dashed-Title",
            (),
        )

    def test_guess_attributes_from_name5(self):
        self._test_guess_attributes_from_name(
            "/path/to/Sender - Title - tag1,tag2,tag3.{}",
            "Sender", "Title", self.TAGS)

    def test_guess_attributes_from_name6(self):
        self._test_guess_attributes_from_name(
            "/path/to/Spaced Sender - Title - tag1,tag2,tag3.{}",
            "Spaced Sender", "Title", self.TAGS)

    def test_guess_attributes_from_name7(self):
        self._test_guess_attributes_from_name(
            "/path/to/Sender - Spaced Title - tag1,tag2,tag3.{}",
            "Sender", "Spaced Title", self.TAGS)

    def test_guess_attributes_from_name8(self):
        self._test_guess_attributes_from_name(
            "/path/to/Dashed-Sender - Title - tag1,tag2,tag3.{}",
            "Dashed-Sender", "Title", self.TAGS)

    def test_guess_attributes_from_name9(self):
        self._test_guess_attributes_from_name(
            "/path/to/Sender - Dashed-Title - tag1,tag2,tag3.{}",
            "Sender", "Dashed-Title", self.TAGS)

    def test_guess_attributes_from_name10(self):
        # Non-ASCII (Greek) correspondent and title.
        self._test_guess_attributes_from_name(
            "/path/to/Σενδερ - Τιτλε - tag1,tag2,tag3.{}",
            "Σενδερ", "Τιτλε", self.TAGS)

    def test_guess_attributes_from_name_when_correspondent_empty(self):
        self._test_guess_attributes_from_name(
            '/path/to/ - weird empty correspondent but should not break.{}',
            None, 'weird empty correspondent but should not break', ())

    def test_guess_attributes_from_name_when_title_starts_with_dash(self):
        self._test_guess_attributes_from_name(
            '/path/to/- weird but should not break.{}',
            None, '- weird but should not break', ())

    def test_guess_attributes_from_name_when_title_ends_with_dash(self):
        self._test_guess_attributes_from_name(
            '/path/to/weird but should not break -.{}',
            None, 'weird but should not break -', ())

    def test_guess_attributes_from_name_when_title_is_empty(self):
        self._test_guess_attributes_from_name(
            '/path/to/weird correspondent but should not break - .{}',
            'weird correspondent but should not break', '', ())


class TestFieldPermutations(TestCase):
    """
    Template-based tests: file names are generated from every combination
    of the sample values below and fed to ``FileInfo.from_path``, whose
    result is then compared against the values used to build the name.
    """

    valid_dates = (
        "20150102030405Z",
        "20150102Z",
    )
    valid_correspondents = [
        "timmy",
        "Dr. McWheelie",
        "Dash Gor-don",
        "ο Θερμαστής",
        ""
    ]
    valid_titles = ["title", "Title w Spaces", "Title a-dash", "Τίτλος", ""]
    valid_tags = ["tag", "tig,tag", "tag1,tag2,tag-3"]
    valid_extensions = ["pdf", "png", "jpg", "jpeg", "gif"]

    def _test_guessed_attributes(self, filename, created=None,
                                 correspondent=None, title=None,
                                 extension=None, tags=None):
        """
        Parse ``filename`` with ``FileInfo.from_path`` and assert that each
        guessed attribute matches the expected value.

        :param filename: path handed to ``FileInfo.from_path``; also used as
            the failure message of every assertion
        :param created: expected date as a string whose first eight
            characters are ``YYYYMMDD``, or ``None`` if no date is expected
        :param correspondent: expected correspondent name; a falsy value
            means no correspondent is expected
        :param title: expected title
        :param extension: expected extension (``jpeg`` is expected to be
            reported as ``jpg``)
        :param tags: expected tag slugs as a comma separated string, or
            ``None`` if no tags are expected
        """

        info = FileInfo.from_path(filename)

        # Created: only year, month and day are compared.
        if created is None:
            self.assertIsNone(info.created, filename)
        else:
            self.assertEqual(info.created.year, int(created[:4]), filename)
            self.assertEqual(info.created.month, int(created[4:6]), filename)
            self.assertEqual(info.created.day, int(created[6:8]), filename)

        # Correspondent
        if correspondent:
            self.assertEqual(info.correspondent.name, correspondent, filename)
        else:
            self.assertIsNone(info.correspondent, filename)

        # Title
        self.assertEqual(info.title, title, filename)

        # Tags
        if tags is None:
            self.assertEqual(info.tags, (), filename)
        else:
            self.assertEqual(
                [t.slug for t in info.tags], tags.split(','),
                filename
            )

        # Extension
        if extension == 'jpeg':
            extension = 'jpg'
        self.assertEqual(info.extension, extension, filename)

    def test_just_title(self):
        template = '/path/to/{title}.{extension}'
        for title, extension in itertools.product(
                self.valid_titles, self.valid_extensions):
            spec = dict(title=title, extension=extension)
            filename = template.format(**spec)
            self._test_guessed_attributes(filename, **spec)

    def test_title_and_correspondent(self):
        template = '/path/to/{correspondent} - {title}.{extension}'
        for correspondent, title, extension in itertools.product(
                self.valid_correspondents, self.valid_titles,
                self.valid_extensions):
            spec = dict(correspondent=correspondent, title=title,
                        extension=extension)
            filename = template.format(**spec)
            self._test_guessed_attributes(filename, **spec)

    def test_title_and_correspondent_and_tags(self):
        template = '/path/to/{correspondent} - {title} - {tags}.{extension}'
        for correspondent, title, tags, extension in itertools.product(
                self.valid_correspondents, self.valid_titles,
                self.valid_tags, self.valid_extensions):
            spec = dict(correspondent=correspondent, title=title,
                        tags=tags, extension=extension)
            filename = template.format(**spec)
            self._test_guessed_attributes(filename, **spec)

    def test_created_and_correspondent_and_title_and_tags(self):

        template = ("/path/to/{created} - "
                    "{correspondent} - "
                    "{title} - "
                    "{tags}"
                    ".{extension}")

        combinations = itertools.product(
            self.valid_dates, self.valid_correspondents, self.valid_titles,
            self.valid_tags, self.valid_extensions)

        for created, correspondent, title, tags, extension in combinations:
            spec = {
                "created": created,
                "correspondent": correspondent,
                "title": title,
                "tags": tags,
                "extension": extension
            }
            self._test_guessed_attributes(template.format(**spec), **spec)

    def test_created_and_correspondent_and_title(self):

        template = ("/path/to/{created} - "
                    "{correspondent} - "
                    "{title}"
                    ".{extension}")

        # Skip cases where title looks like a tag as we can't
        # accommodate such cases.  In practice this drops every title that
        # is unchanged by lower-casing, the empty title included.
        titles = [t for t in self.valid_titles if t.lower() != t]

        combinations = itertools.product(
            self.valid_dates, self.valid_correspondents, titles,
            self.valid_extensions)

        for created, correspondent, title, extension in combinations:
            spec = {
                "created": created,
                "correspondent": correspondent,
                "title": title,
                "extension": extension
            }
            self._test_guessed_attributes(template.format(**spec), **spec)

    def test_created_and_title(self):

        template = ("/path/to/{created} - "
                    "{title}"
                    ".{extension}")

        for created, title, extension in itertools.product(
                self.valid_dates, self.valid_titles, self.valid_extensions):
            spec = {
                "created": created,
                "title": title,
                "extension": extension
            }
            self._test_guessed_attributes(template.format(**spec), **spec)

    def test_created_and_title_and_tags(self):

        template = ("/path/to/{created} - "
                    "{title} - "
                    "{tags}"
                    ".{extension}")

        for created, title, tags, extension in itertools.product(
                self.valid_dates, self.valid_titles, self.valid_tags,
                self.valid_extensions):
            spec = {
                "created": created,
                "title": title,
                "tags": tags,
                "extension": extension
            }
            self._test_guessed_attributes(template.format(**spec), **spec)


class FakeTesseract:
    """
    Test double for a pyocr tool: it claims to support orientation
    detection, fails when asked to perform it, and returns fixed text for
    OCR.
    """

    @staticmethod
    def can_detect_orientation():
        """Always report that orientation detection is available."""
        return True

    @staticmethod
    def detect_orientation(file_handle, lang):
        """Ignore the arguments and raise pyocr's *other* TesseractError."""
        failure = OtherTesseractError("arbitrary status", "message")
        raise failure

    @staticmethod
    def image_to_string(file_handle, lang):
        """Ignore the arguments and return a fixed piece of text."""
        return "This is test text"


class FakePyOcr:
    """
    Test double exposing ``get_available_tools``, the one ``pyocr``
    function it fakes, so that it can be patched in for the real module.
    """

    @staticmethod
    def get_available_tools():
        """Return a list whose only entry is the ``FakeTesseract`` class."""
        tools = [FakeTesseract]
        return tools


class TestOCR(TestCase):
    """
    Tests for the OCR helpers imported from the consumer module:
    ``strip_excess_whitespace`` and ``image_to_string``.
    """

    # (input, expected output) pairs for strip_excess_whitespace.
    text_cases = [
        ("simple     string", "simple string"),
        (
            "simple    newline\n   testing string",
            "simple newline\ntesting string"
        ),
        (
            "utf-8   строка с пробелами в конце  ",
            "utf-8 строка с пробелами в конце"
        )
    ]

    # Directory of sample files located next to this test module.
    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
    # True when pyocr reports at least one available OCR tool.
    TESSERACT_INSTALLED = bool(pyocr.get_available_tools())

    def test_strip_excess_whitespace(self):
        """Every entry of ``text_cases`` must be normalised as expected."""
        for source, result in self.text_cases:
            actual_result = strip_excess_whitespace(source)
            self.assertEqual(
                result,
                actual_result,
                "strip_excess_whitespace({}) != '{}', but '{}'".format(
                    source,
                    result,
                    actual_result
                )
            )

    @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
    @mock.patch("documents.consumer.Consumer.SCRATCH", SAMPLE_FILES)
    @mock.patch("documents.consumer.pyocr", FakePyOcr)
    def test_image_to_string_with_text_free_page(self):
        """
        This test is sort of silly, since it's really just reproducing an odd
        exception thrown by pyocr when it encounters a page with no text.
        Actually running this test against an installation of Tesseract results
        in a segmentation fault rooted somewhere deep inside pyocr where I
        don't care to dig.  Regardless, if you run the consumer normally,
        text-free pages are now handled correctly so long as we work around
        this weird exception.
        """
        # No assertion on the result: the test passes as long as the call
        # does not raise.
        image_to_string(["text.png", "en"])
-												Fix for #154

* Added a test with a faked pyocr and tesseract
* Added a catch for pyocr's *other* TesseractError

											
										
										
											2016-11-27 15:06:45 +00:00
+								import os
 								from unittest import mock, skipIf
 								import pyocr
-												Tests for the consumer

											
										
										
											2016-02-11 12:25:23 +00:00
+								from django.test import TestCase
-												Fix for #154

* Added a test with a faked pyocr and tesseract
* Added a catch for pyocr's *other* TesseractError

											
										
										
											2016-11-27 15:06:45 +00:00
+								from pyocr.libtesseract.tesseract_raw import \
 								    TesseractError as OtherTesseractError
-												Tests for the consumer

											
										
										
											2016-02-11 12:25:23 +00:00
-												Fix for #131: delete files on document.delete

											
										
										
											2016-08-16 19:13:37 +01:00
+								from ..models import FileInfo
-												Fix for #154

* Added a test with a faked pyocr and tesseract
* Added a catch for pyocr's *other* TesseractError

											
										
										
											2016-11-27 15:06:45 +00:00
+								from ..consumer import image_to_string, strip_excess_whitespace
-												Tests for the consumer

											
										
										
											2016-02-11 12:25:23 +00:00
-												Actually write the date found in the file name

											
										
										
											2016-08-20 18:11:51 +01:00
+								class TestAttributes(TestCase):
-												pep8

											
										
										
											2016-02-21 00:14:50 +00:00
-												Fixed the auto-naming regexes

											
										
										
											2016-02-11 22:05:55 +00:00
+								    TAGS = ("tag1", "tag2", "tag3")
-												Modifications for support for dates

											
										
										
											2016-03-24 19:18:33 +00:00
+								    EXTENSIONS = (
-												Compensate for case and format of jpg vs. jpeg

											
										
										
											2016-02-23 20:15:13 +00:00
+								        "pdf", "png", "jpg", "jpeg", "gif",
 								        "PDF", "PNG", "JPG", "JPEG", "GIF",
 								        "PdF", "PnG", "JpG", "JPeG", "GiF",
 								    )
-												pep8

											
										
										
											2016-02-21 00:14:50 +00:00
-												Fixed the auto-naming regexes

											
										
										
											2016-02-11 22:05:55 +00:00
+								    def _test_guess_attributes_from_name(self, path, sender, title, tags):
-												Modifications for support for dates

											
										
										
											2016-03-24 19:18:33 +00:00
 								        for extension in self.EXTENSIONS:
 								            f = path.format(extension)
-												Make tests pass

											
										
										
											2016-03-07 21:37:18 +02:00
+								            file_info = FileInfo.from_path(f)
-												Modifications for support for dates

											
										
										
											2016-03-24 19:18:33 +00:00
 								            if sender:
 								                self.assertEqual(file_info.correspondent.name, sender, f)
 								            else:
 								                self.assertIsNone(file_info.correspondent, f)
-												Make tests pass

											
										
										
											2016-03-07 21:37:18 +02:00
+								            self.assertEqual(file_info.title, title, f)
-												Modifications for support for dates

											
										
										
											2016-03-24 19:18:33 +00:00
-												Make tests pass

											
										
										
											2016-03-07 21:37:18 +02:00
+								            self.assertEqual(tuple([t.slug for t in file_info.tags]), tags, f)
-												Modifications for support for dates

											
										
										
											2016-03-24 19:18:33 +00:00
+								            if extension.lower() == "jpeg":
 								                self.assertEqual(file_info.extension, "jpg", f)
-												Compensate for case and format of jpg vs. jpeg

											
										
										
											2016-02-23 20:15:13 +00:00
+								            else:
-												Modifications for support for dates

											
										
										
											2016-03-24 19:18:33 +00:00
+								                self.assertEqual(file_info.extension, extension.lower(), f)
-												Tests for the consumer

											
										
										
											2016-02-11 12:25:23 +00:00
-												Fixed the auto-naming regexes

											
										
										
											2016-02-11 22:05:55 +00:00
+								    def test_guess_attributes_from_name0(self):
 								        self._test_guess_attributes_from_name(
 								            "/path/to/Sender - Title.{}", "Sender", "Title", ())
 								    def test_guess_attributes_from_name1(self):
 								        self._test_guess_attributes_from_name(
 								            "/path/to/Spaced Sender - Title.{}", "Spaced Sender", "Title", ())
 								    def test_guess_attributes_from_name2(self):
 								        self._test_guess_attributes_from_name(
 								            "/path/to/Sender - Spaced Title.{}", "Sender", "Spaced Title", ())
 								    def test_guess_attributes_from_name3(self):
 								        self._test_guess_attributes_from_name(
 								            "/path/to/Dashed-Sender - Title.{}", "Dashed-Sender", "Title", ())
 								    def test_guess_attributes_from_name4(self):
 								        self._test_guess_attributes_from_name(
 								            "/path/to/Sender - Dashed-Title.{}", "Sender", "Dashed-Title", ())
 								    def test_guess_attributes_from_name5(self):
 								        self._test_guess_attributes_from_name(
 								            "/path/to/Sender - Title - tag1,tag2,tag3.{}",
 								            "Sender",
 								            "Title",
 								            self.TAGS
 								        )
 								    def test_guess_attributes_from_name6(self):
 								        self._test_guess_attributes_from_name(
 								            "/path/to/Spaced Sender - Title - tag1,tag2,tag3.{}",
 								            "Spaced Sender",
 								            "Title",
 								            self.TAGS
 								        )
 								    def test_guess_attributes_from_name7(self):
 								        self._test_guess_attributes_from_name(
 								            "/path/to/Sender - Spaced Title - tag1,tag2,tag3.{}",
 								            "Sender",
 								            "Spaced Title",
 								            self.TAGS
 								        )
 								    def test_guess_attributes_from_name8(self):
 								        self._test_guess_attributes_from_name(
 								            "/path/to/Dashed-Sender - Title - tag1,tag2,tag3.{}",
 								            "Dashed-Sender",
 								            "Title",
 								            self.TAGS
 								        )
 								    def test_guess_attributes_from_name9(self):
 								        self._test_guess_attributes_from_name(
 								            "/path/to/Sender - Dashed-Title - tag1,tag2,tag3.{}",
 								            "Sender",
 								            "Dashed-Title",
 								            self.TAGS
 								        )
 								    def test_guess_attributes_from_name10(self):
 								        self._test_guess_attributes_from_name(
 								            "/path/to/Σενδερ - Τιτλε - tag1,tag2,tag3.{}",
 								            "Σενδερ",
 								            "Τιτλε",
 								            self.TAGS
-												Tests for the consumer

											
										
										
											2016-02-11 12:25:23 +00:00
+								        )
-												Template-based tests of combinations of valid elements

											
										
										
											2016-03-07 21:42:52 +02:00
-												Add some failing edge case tests

											
										
										
											2016-03-07 21:48:47 +02:00
+								    def test_guess_attributes_from_name_when_correspondent_empty(self):
 								        self._test_guess_attributes_from_name(
 								            '/path/to/ - weird empty correspondent but should not break.{}',
 								            None,
-												Modifications for support for dates

											
										
										
											2016-03-24 19:18:33 +00:00
+								            'weird empty correspondent but should not break',
-												Add some failing edge case tests

											
										
										
											2016-03-07 21:48:47 +02:00
+								            ()
 								        )
 								    def test_guess_attributes_from_name_when_title_starts_with_dash(self):
 								        self._test_guess_attributes_from_name(
 								            '/path/to/- weird but should not break.{}',
 								            None,
 								            '- weird but should not break',
 								            ()
 								        )
 								    def test_guess_attributes_from_name_when_title_ends_with_dash(self):
 								        self._test_guess_attributes_from_name(
 								            '/path/to/weird but should not break -.{}',
 								            None,
 								            'weird but should not break -',
 								            ()
 								        )
 								    def test_guess_attributes_from_name_when_title_is_empty(self):
 								        self._test_guess_attributes_from_name(
 								            '/path/to/weird correspondent but should not break - .{}',
 								            'weird correspondent but should not break',
 								            '',
 								            ()
 								        )
-												Template-based tests of combinations of valid elements

											
										
										
											2016-03-07 21:42:52 +02:00
-												pep8 corrections

											
										
										
											2016-10-26 09:32:59 +00:00
+								class TestFieldPermutations(TestCase):
-												Modifications for support for dates

											
										
										
											2016-03-24 19:18:33 +00:00
 								    valid_dates = (
 								        "20150102030405Z",
 								        "20150102Z",
 								    )
 								    valid_correspondents = [
 								        "timmy",
 								        "Dr. McWheelie",
 								        "Dash Gor-don",
 								        "ο Θερμαστής",
 								        ""
 								    ]
 								    valid_titles = ["title", "Title w Spaces", "Title a-dash", "Τίτλος", ""]
 								    valid_tags = ["tag", "tig,tag", "tag1,tag2,tag-3"]
 								    valid_extensions = ["pdf", "png", "jpg", "jpeg", "gif"]
 								    def _test_guessed_attributes(self, filename, created=None,
 								                                 correspondent=None, title=None,
 								                                 extension=None, tags=None):
 								        info = FileInfo.from_path(filename)
 								        # Created
 								        if created is None:
 								            self.assertIsNone(info.created, filename)
 								        else:
 								            self.assertEqual(info.created.year, int(created[:4]), filename)
 								            self.assertEqual(info.created.month, int(created[4:6]), filename)
 								            self.assertEqual(info.created.day, int(created[6:8]), filename)
 								        # Correspondent
 								        if correspondent:
 								            self.assertEqual(info.correspondent.name, correspondent, filename)
-												Template-based tests of combinations of valid elements

											
										
										
											2016-03-07 21:42:52 +02:00
+								        else:
-												Modifications for support for dates

											
										
										
											2016-03-24 19:18:33 +00:00
+								            self.assertEqual(info.correspondent, None, filename)
 								        # Title
 								        self.assertEqual(info.title, title, filename)
 								        # Tags
-												Template-based tests of combinations of valid elements

											
										
										
											2016-03-07 21:42:52 +02:00
+								        if tags is None:
-												Modifications for support for dates

											
										
										
											2016-03-24 19:18:33 +00:00
+								            self.assertEqual(info.tags, (), filename)
-												Template-based tests of combinations of valid elements

											
										
										
											2016-03-07 21:42:52 +02:00
+								        else:
-												Modifications for support for dates

											
										
										
											2016-03-24 19:18:33 +00:00
+								            self.assertEqual(
 								                [t.slug for t in info.tags], tags.split(','),
 								                filename
 								            )
 								        # Extension
 								        if extension == 'jpeg':
 								            extension = 'jpg'
 								        self.assertEqual(info.extension, extension, filename)
-												Template-based tests of combinations of valid elements

											
										
										
											2016-03-07 21:42:52 +02:00
 								    def test_just_title(self):
-												Modifications for support for dates

											
										
										
											2016-03-24 19:18:33 +00:00
+								        template = '/path/to/{title}.{extension}'
-												Template-based tests of combinations of valid elements

											
										
										
											2016-03-07 21:42:52 +02:00
+								        for title in self.valid_titles:
-												Modifications for support for dates

											
										
										
											2016-03-24 19:18:33 +00:00
+								            for extension in self.valid_extensions:
 								                spec = dict(title=title, extension=extension)
-												Template-based tests of combinations of valid elements

											
										
										
											2016-03-07 21:42:52 +02:00
+								                filename = template.format(**spec)
 								                self._test_guessed_attributes(filename, **spec)
 								    def test_title_and_correspondent(self):
-												Modifications for support for dates

											
										
										
											2016-03-24 19:18:33 +00:00
+								        template = '/path/to/{correspondent} - {title}.{extension}'
-												Template-based tests of combinations of valid elements

											
										
										
											2016-03-07 21:42:52 +02:00
+								        for correspondent in self.valid_correspondents:
 								            for title in self.valid_titles:
-												Modifications for support for dates

											
										
										
											2016-03-24 19:18:33 +00:00
+								                for extension in self.valid_extensions:
-												Template-based tests of combinations of valid elements

											
										
										
											2016-03-07 21:42:52 +02:00
+								                    spec = dict(correspondent=correspondent, title=title,
-												Modifications for support for dates

											
										
										
											2016-03-24 19:18:33 +00:00
+								                                extension=extension)
-												Template-based tests of combinations of valid elements

											
										
										
											2016-03-07 21:42:52 +02:00
+								                    filename = template.format(**spec)
 								                    self._test_guessed_attributes(filename, **spec)
 								    def test_title_and_correspondent_and_tags(self):
-												Modifications for support for dates

											
										
										
											2016-03-24 19:18:33 +00:00
+								        template = '/path/to/{correspondent} - {title} - {tags}.{extension}'
-												Template-based tests of combinations of valid elements

											
										
										
											2016-03-07 21:42:52 +02:00
+								        for correspondent in self.valid_correspondents:
 								            for title in self.valid_titles:
 								                for tags in self.valid_tags:
-												Modifications for support for dates

											
										
										
											2016-03-24 19:18:33 +00:00
+								                    for extension in self.valid_extensions:
-												Template-based tests of combinations of valid elements

											
										
										
											2016-03-07 21:42:52 +02:00
+								                        spec = dict(correspondent=correspondent, title=title,
-												Modifications for support for dates

											
										
										
											2016-03-24 19:18:33 +00:00
+								                                    tags=tags, extension=extension)
-												Template-based tests of combinations of valid elements

											
										
										
											2016-03-07 21:42:52 +02:00
+								                        filename = template.format(**spec)
 								                        self._test_guessed_attributes(filename, **spec)
-												Modifications for support for dates

											
										
										
											2016-03-24 19:18:33 +00:00
 								    def test_created_and_correspondent_and_title_and_tags(self):
 								        template = ("/path/to/{created} - "
 								                    "{correspondent} - "
 								                    "{title} - "
 								                    "{tags}"
 								                    ".{extension}")
 								        for created in self.valid_dates:
 								            for correspondent in self.valid_correspondents:
 								                for title in self.valid_titles:
 								                    for tags in self.valid_tags:
 								                        for extension in self.valid_extensions:
 								                            spec = {
 								                                "created": created,
 								                                "correspondent": correspondent,
 								                                "title": title,
 								                                "tags": tags,
 								                                "extension": extension
 								                            }
 								                            self._test_guessed_attributes(
 								                                template.format(**spec), **spec)
 								    def test_created_and_correspondent_and_title(self):
 								        template = ("/path/to/{created} - "
 								                    "{correspondent} - "
 								                    "{title}"
 								                    ".{extension}")
 								        for created in self.valid_dates:
 								            for correspondent in self.valid_correspondents:
 								                for title in self.valid_titles:
 								                    # Skip cases where title looks like a tag as we can't
 								                    # accommodate such cases.
 								                    if title.lower() == title:
 								                        continue
 								                    for extension in self.valid_extensions:
 								                        spec = {
 								                            "created": created,
 								                            "correspondent": correspondent,
 								                            "title": title,
 								                            "extension": extension
 								                        }
 								                        self._test_guessed_attributes(
 								                            template.format(**spec), **spec)
 								    def test_created_and_title(self):
 								        template = ("/path/to/{created} - "
 								                    "{title}"
 								                    ".{extension}")
 								        for created in self.valid_dates:
 								            for title in self.valid_titles:
 								                for extension in self.valid_extensions:
 								                    spec = {
 								                        "created": created,
 								                        "title": title,
 								                        "extension": extension
 								                    }
 								                    self._test_guessed_attributes(
 								                        template.format(**spec), **spec)
 								    def test_created_and_title_and_tags(self):
 								        template = ("/path/to/{created} - "
 								                    "{title} - "
 								                    "{tags}"
 								                    ".{extension}")
 								        for created in self.valid_dates:
 								            for title in self.valid_titles:
 								                for tags in self.valid_tags:
 								                    for extension in self.valid_extensions:
 								                        spec = {
 								                            "created": created,
 								                            "title": title,
 								                            "tags": tags,
 								                            "extension": extension
 								                        }
 								                        self._test_guessed_attributes(
 								                            template.format(**spec), **spec)
-												Collapsing excess whitespace after OCR

											
										
										
											2016-10-12 01:46:34 +02:00
-												Fix for #154

* Added a test with a faked pyocr and tesseract
* Added a catch for pyocr's *other* TesseractError

											
										
										
											2016-11-27 15:06:45 +00:00
class FakeTesseract(object):
    """
    Stand-in OCR tool for tests.

    It reports that it can detect orientation, raises OtherTesseractError
    when asked to do so, and returns a fixed string as the recognised text.
    """

    @staticmethod
    def can_detect_orientation():
        """Always claim that orientation detection is available."""
        return True

    @staticmethod
    def detect_orientation(file_handle, lang):
        """Always raise OtherTesseractError; both arguments are ignored."""
        raise OtherTesseractError("arbitrary status", "message")

    @staticmethod
    def image_to_string(file_handle, lang):
        """Return a fixed piece of text; both arguments are ignored."""
        return "This is test text"
 								class FakePyOcr(object):
 								    """Test double exposing only get_available_tools."""
 								    @staticmethod
 								    def get_available_tools():
 								        """Return a one-element list holding the FakeTesseract class."""
 								        available = [FakeTesseract]
 								        return available
-												Collapsing excess whitespace after OCR

											
										
										
											2016-10-12 01:46:34 +02:00
+								class TestOCR(TestCase):
-												pep8 corrections

											
										
										
											2016-10-26 09:32:59 +00:00
-												Collapsing excess whitespace after OCR

											
										
										
											2016-10-12 01:46:34 +02:00
+								    text_cases = [
 								        ("simple     string", "simple string"),
-												pep8 corrections

											
										
										
											2016-10-26 09:32:59 +00:00
+								        (
 								            "simple    newline\n   testing string",
 								            "simple newline\ntesting string"
 								        ),
 								        (
 								            "utf-8   строка с пробелами в конце  ",
 								            "utf-8 строка с пробелами в конце"
 								        )
-												Collapsing excess whitespace after OCR

											
										
										
											2016-10-12 01:46:34 +02:00
+								    ]
-												pep8

											
										
										
											2016-11-27 15:10:07 +00:00
-												Fix for #154

* Added a test with a faked pyocr and tesseract
* Added a catch for pyocr's *other* TesseractError

											
										
										
											2016-11-27 15:06:45 +00:00
+								    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
 								    TESSERACT_INSTALLED = bool(pyocr.get_available_tools())
-												Collapsing excess whitespace after OCR

											
										
										
											2016-10-12 01:46:34 +02:00
 								    def test_strip_excess_whitespace(self):
 								        """
 								        Run strip_excess_whitespace over every (source, expected) pair in
 								        self.text_cases and assert that the output equals the expected
 								        string, reporting all three values on failure.
 								        """
 								        for source, result in self.text_cases:
 								            actual_result = strip_excess_whitespace(source)
 								            self.assertEqual(
 								                result,
 								                actual_result,
 								                "strip_excess_whitespace({}) != '{}', but '{}'".format(
 								                    source,
 								                    result,
 								                    actual_result
 								                )
 								            )
-												Fix for #154

* Added a test with a faked pyocr and tesseract
* Added a catch for pyocr's *other* TesseractError

											
										
										
											2016-11-27 15:06:45 +00:00
 								    @skipIf(not TESSERACT_INSTALLED, "Tesseract not installed. Skipping")
 								    @mock.patch("documents.consumer.Consumer.SCRATCH", SAMPLE_FILES)
 								    @mock.patch("documents.consumer.pyocr", FakePyOcr)
 								    def test_image_to_string_with_text_free_page(self):
 								        """
 								        This test is sort of silly, since it's really just reproducing an odd
 								        exception thrown by pyocr when it encounters a page with no text.
 								        Actually running this test against an installation of Tesseract results
 								        in a segmentation fault rooted somewhere deep inside pyocr where I
 								        don't care to dig.  Regardless, if you run the consumer normally,
 								        text-free pages are now handled correctly so long as we work around
 								        this weird exception.
 								        """
 								        image_to_string(["text.png", "en"])