2020-11-16 23:16:37 +01:00
|
|
|
|
import os
|
2019-09-08 20:24:58 +02:00
|
|
|
|
import re
|
2020-11-16 23:16:37 +01:00
|
|
|
|
import shutil
|
|
|
|
|
|
import tempfile
|
|
|
|
|
|
from unittest import mock
|
|
|
|
|
|
from unittest.mock import MagicMock
|
2019-09-08 20:24:58 +02:00
|
|
|
|
|
2020-11-16 23:16:37 +01:00
|
|
|
|
from django.test import TestCase, override_settings
|
2016-02-11 12:25:23 +00:00
|
|
|
|
|
2020-11-16 23:16:37 +01:00
|
|
|
|
from ..consumer import Consumer, ConsumerError
|
|
|
|
|
|
from ..models import FileInfo, Tag, Correspondent, DocumentType, Document
|
|
|
|
|
|
from ..parsers import DocumentParser, ParseError
|
2016-02-11 12:25:23 +00:00
|
|
|
|
|
|
|
|
|
|
|
2016-08-20 18:11:51 +01:00
|
|
|
|
class TestAttributes(TestCase):
    """Check what ``FileInfo.from_filename`` extracts from a file name.

    The file names used here follow the ``correspondent - title - tags.ext``
    layout; each test feeds one name through the parser and compares the
    resulting correspondent, title and tag slugs.
    """

    TAGS = ("tag1", "tag2", "tag3")

    def _test_guess_attributes_from_name(self, filename, sender, title, tags):
        """Parse ``filename`` and assert the extracted attributes.

        :param filename: file name handed to ``FileInfo.from_filename``.
        :param sender: expected correspondent name; a falsy value means no
            correspondent is expected at all.
        :param title: expected title.
        :param tags: expected tag slugs as a tuple, in order.
        """
        file_info = FileInfo.from_filename(filename)

        if sender:
            self.assertEqual(file_info.correspondent.name, sender, filename)
        else:
            self.assertIsNone(file_info.correspondent, filename)

        self.assertEqual(file_info.title, title, filename)

        self.assertEqual(
            tuple(t.slug for t in file_info.tags), tags, filename)

    def test_guess_attributes_from_name0(self):
        self._test_guess_attributes_from_name(
            "Sender - Title.pdf", "Sender", "Title", ())

    def test_guess_attributes_from_name1(self):
        self._test_guess_attributes_from_name(
            "Spaced Sender - Title.pdf", "Spaced Sender", "Title", ())

    def test_guess_attributes_from_name2(self):
        self._test_guess_attributes_from_name(
            "Sender - Spaced Title.pdf", "Sender", "Spaced Title", ())

    def test_guess_attributes_from_name3(self):
        self._test_guess_attributes_from_name(
            "Dashed-Sender - Title.pdf", "Dashed-Sender", "Title", ())

    def test_guess_attributes_from_name4(self):
        self._test_guess_attributes_from_name(
            "Sender - Dashed-Title.pdf", "Sender", "Dashed-Title", ())

    def test_guess_attributes_from_name5(self):
        self._test_guess_attributes_from_name(
            "Sender - Title - tag1,tag2,tag3.pdf",
            "Sender",
            "Title",
            self.TAGS
        )

    def test_guess_attributes_from_name6(self):
        self._test_guess_attributes_from_name(
            "Spaced Sender - Title - tag1,tag2,tag3.pdf",
            "Spaced Sender",
            "Title",
            self.TAGS
        )

    def test_guess_attributes_from_name7(self):
        self._test_guess_attributes_from_name(
            "Sender - Spaced Title - tag1,tag2,tag3.pdf",
            "Sender",
            "Spaced Title",
            self.TAGS
        )

    def test_guess_attributes_from_name8(self):
        self._test_guess_attributes_from_name(
            "Dashed-Sender - Title - tag1,tag2,tag3.pdf",
            "Dashed-Sender",
            "Title",
            self.TAGS
        )

    def test_guess_attributes_from_name9(self):
        self._test_guess_attributes_from_name(
            "Sender - Dashed-Title - tag1,tag2,tag3.pdf",
            "Sender",
            "Dashed-Title",
            self.TAGS
        )

    def test_guess_attributes_from_name10(self):
        self._test_guess_attributes_from_name(
            "Σενδερ - Τιτλε - tag1,tag2,tag3.pdf",
            "Σενδερ",
            "Τιτλε",
            self.TAGS
        )

    def test_guess_attributes_from_name_when_correspondent_empty(self):
        self._test_guess_attributes_from_name(
            ' - weird empty correspondent but should not break.pdf',
            None,
            'weird empty correspondent but should not break',
            ()
        )

    def test_guess_attributes_from_name_when_title_starts_with_dash(self):
        self._test_guess_attributes_from_name(
            '- weird but should not break.pdf',
            None,
            '- weird but should not break',
            ()
        )

    def test_guess_attributes_from_name_when_title_ends_with_dash(self):
        self._test_guess_attributes_from_name(
            'weird but should not break -.pdf',
            None,
            'weird but should not break -',
            ()
        )

    def test_guess_attributes_from_name_when_title_is_empty(self):
        self._test_guess_attributes_from_name(
            'weird correspondent but should not break - .pdf',
            'weird correspondent but should not break',
            '',
            ()
        )

    def test_case_insensitive_tag_creation(self):
        """Tags should be detected and created as lower case.

        A mixed-case and an all-lower-case spelling of the same two tags are
        parsed one after the other; afterwards only two ``Tag`` rows may
        exist.
        """
        mixed_case = "Title - Correspondent - tAg1,TAG2.pdf"
        self.assertEqual(len(FileInfo.from_filename(mixed_case).tags), 2)

        # The second parse must use the lower-case name. Previously the
        # mixed-case name was parsed twice and this variable went unused.
        lower_case = "Title - Correspondent - tag1,tag2.pdf"
        self.assertEqual(len(FileInfo.from_filename(lower_case).tags), 2)

        self.assertEqual(Tag.objects.all().count(), 2)
|
|
|
|
|
|
|
2016-03-07 21:42:52 +02:00
|
|
|
|
|
2016-10-26 09:32:59 +00:00
|
|
|
|
class TestFieldPermutations(TestCase):
    """Run ``FileInfo.from_filename`` over combinations of name components.

    Each test formats a file-name template with every combination of the
    sample dates, correspondents, titles and tags below and checks that the
    parser hands the same values back.
    """

    valid_dates = (
        "20150102030405Z",
        "20150102Z",
    )
    valid_correspondents = [
        "timmy",
        "Dr. McWheelie",
        "Dash Gor-don",
        "ο Θερμαστής",
        ""
    ]
    valid_titles = ["title", "Title w Spaces", "Title a-dash", "Τίτλος", ""]
    valid_tags = ["tag", "tig,tag", "tag1,tag2,tag-3"]

    def _test_guessed_attributes(self, filename, created=None,
                                 correspondent=None, title=None,
                                 tags=None):
        """Parse ``filename`` and compare every attribute with the expectation.

        ``created`` is a ``YYYYMMDD...`` string (or None when no date is
        expected), ``correspondent`` a name (falsy when none is expected),
        ``title`` the expected title and ``tags`` a comma-separated string of
        slugs (or None when no tags are expected).
        """
        parsed = FileInfo.from_filename(filename)

        # Creation date: compare year, month and day with the digit groups
        # of the expected string.
        if created is not None:
            expected_parts = (
                ("year", created[:4]),
                ("month", created[4:6]),
                ("day", created[6:8]),
            )
            for field, digits in expected_parts:
                self.assertEqual(
                    getattr(parsed.created, field), int(digits), filename)
        else:
            self.assertIsNone(parsed.created, filename)

        # Correspondent: an empty expectation means "no correspondent".
        if not correspondent:
            self.assertEqual(parsed.correspondent, None, filename)
        else:
            self.assertEqual(
                parsed.correspondent.name, correspondent, filename)

        # Title
        self.assertEqual(parsed.title, title, filename)

        # Tags
        if tags is not None:
            expected_slugs = tags.split(',')
            actual_slugs = [tag.slug for tag in parsed.tags]
            self.assertEqual(actual_slugs, expected_slugs, filename)
        else:
            self.assertEqual(parsed.tags, (), filename)

    def test_just_title(self):
        template = '{title}.pdf'
        specs = [{"title": title} for title in self.valid_titles]
        for spec in specs:
            self._test_guessed_attributes(template.format(**spec), **spec)

    def test_title_and_correspondent(self):
        template = '{correspondent} - {title}.pdf'
        specs = [
            {"correspondent": correspondent, "title": title}
            for correspondent in self.valid_correspondents
            for title in self.valid_titles
        ]
        for spec in specs:
            self._test_guessed_attributes(template.format(**spec), **spec)

    def test_title_and_correspondent_and_tags(self):
        template = '{correspondent} - {title} - {tags}.pdf'
        specs = [
            {"correspondent": correspondent, "title": title, "tags": tags}
            for correspondent in self.valid_correspondents
            for title in self.valid_titles
            for tags in self.valid_tags
        ]
        for spec in specs:
            self._test_guessed_attributes(template.format(**spec), **spec)

    def test_created_and_correspondent_and_title_and_tags(self):
        template = (
            "{created} - "
            "{correspondent} - "
            "{title} - "
            "{tags}.pdf"
        )
        specs = [
            {
                "created": created,
                "correspondent": correspondent,
                "title": title,
                "tags": tags,
            }
            for created in self.valid_dates
            for correspondent in self.valid_correspondents
            for title in self.valid_titles
            for tags in self.valid_tags
        ]
        for spec in specs:
            self._test_guessed_attributes(template.format(**spec), **spec)

    def test_created_and_correspondent_and_title(self):
        template = "{created} - {correspondent} - {title}.pdf"
        # Titles that are already all lower case are left out: they look
        # like a tag, and such cases can't be accommodated.
        specs = [
            {
                "created": created,
                "correspondent": correspondent,
                "title": title,
            }
            for created in self.valid_dates
            for correspondent in self.valid_correspondents
            for title in self.valid_titles
            if title.lower() != title
        ]
        for spec in specs:
            self._test_guessed_attributes(template.format(**spec), **spec)

    def test_created_and_title(self):
        template = "{created} - {title}.pdf"
        specs = [
            {"created": created, "title": title}
            for created in self.valid_dates
            for title in self.valid_titles
        ]
        for spec in specs:
            self._test_guessed_attributes(template.format(**spec), **spec)

    def test_created_and_title_and_tags(self):
        template = "{created} - {title} - {tags}.pdf"
        specs = [
            {"created": created, "title": title, "tags": tags}
            for created in self.valid_dates
            for title in self.valid_titles
            for tags in self.valid_tags
        ]
        for spec in specs:
            self._test_guessed_attributes(template.format(**spec), **spec)

    def test_invalid_date_format(self):
        parsed = FileInfo.from_filename("06112017Z - title.pdf")
        self.assertEqual(parsed.title, "title")
        self.assertIsNone(parsed.created)

    def test_filename_parse_transforms(self):
        """Check how ``FILENAME_PARSE_TRANSFORMS`` rewrites a name before parsing."""
        scanned_name = "tag1,tag2_20190908_180610_0001.pdf"
        untouched_title = "tag1,tag2_20190908_180610_0001"

        match_everything = re.compile("^.*$")
        match_nothing = re.compile("$a")
        match_scanner = re.compile(
            "^([a-z0-9,]+)_(\\d{8})_(\\d{6})_([0-9]+)\\.")
        # (empty) correspondent, title and tags
        without_date = " - \\4 - \\1."
        # creation date followed by the replacement above
        with_date = "\\2Z - " + without_date

        # No transformations configured (= default)
        parsed = FileInfo.from_filename(scanned_name)
        self.assertEqual(parsed.title, untouched_title)
        self.assertEqual(parsed.tags, ())
        self.assertIsNone(parsed.created)

        # Pattern doesn't match (filename unaltered)
        transforms = [(match_nothing, "none.gif")]
        with self.settings(FILENAME_PARSE_TRANSFORMS=transforms):
            parsed = FileInfo.from_filename(scanned_name)
            self.assertEqual(parsed.title, untouched_title)

        # Simple transformation (match all)
        transforms = [(match_everything, "all.gif")]
        with self.settings(FILENAME_PARSE_TRANSFORMS=transforms):
            parsed = FileInfo.from_filename(scanned_name)
            self.assertEqual(parsed.title, "all")

        # Multiple transformations configured (first pattern matches)
        transforms = [
            (match_everything, "all.gif"),
            (match_everything, "anotherall.gif"),
        ]
        with self.settings(FILENAME_PARSE_TRANSFORMS=transforms):
            parsed = FileInfo.from_filename(scanned_name)
            self.assertEqual(parsed.title, "all")

        # Multiple transformations configured (second pattern matches)
        transforms = [
            (match_nothing, "none.gif"),
            (match_everything, "anotherall.gif"),
        ]
        with self.settings(FILENAME_PARSE_TRANSFORMS=transforms):
            parsed = FileInfo.from_filename(scanned_name)
            self.assertEqual(parsed.title, "anotherall")

        # Complex transformation without date in replacement string
        transforms = [(match_scanner, without_date)]
        with self.settings(FILENAME_PARSE_TRANSFORMS=transforms):
            parsed = FileInfo.from_filename(scanned_name)
            self.assertEqual(parsed.title, "0001")
            self.assertEqual(len(parsed.tags), 2)
            self.assertEqual(parsed.tags[0].slug, "tag1")
            self.assertEqual(parsed.tags[1].slug, "tag2")
            self.assertIsNone(parsed.created)

        # Complex transformation with date in replacement string
        transforms = [
            (match_nothing, "none.gif"),
            (match_scanner, with_date),  # <-- matches
            (match_scanner, without_date),
            (match_everything, "all.gif"),
        ]
        with self.settings(FILENAME_PARSE_TRANSFORMS=transforms):
            parsed = FileInfo.from_filename(scanned_name)
            self.assertEqual(parsed.title, "0001")
            self.assertEqual(len(parsed.tags), 2)
            self.assertEqual(parsed.tags[0].slug, "tag1")
            self.assertEqual(parsed.tags[1].slug, "tag2")
            self.assertEqual(parsed.created.year, 2019)
            self.assertEqual(parsed.created.month, 9)
            self.assertEqual(parsed.created.day, 8)
|
2020-11-16 23:16:37 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DummyParser(DocumentParser):
    """Parser stub for the consumer tests.

    ``parse`` sets a fixed text and ``get_optimised_thumbnail`` returns the
    path of an empty ``.png`` file created inside ``scratch_dir``.
    """

    def __init__(self, logging_group, scratch_dir):
        """Create the stub and its placeholder thumbnail file.

        :param logging_group: forwarded unchanged to ``DocumentParser``.
        :param scratch_dir: directory in which the placeholder thumbnail
            file is created.
        """
        super().__init__(logging_group)
        # mkstemp hands back an open OS-level descriptor together with the
        # path. Only the path is needed, so close the descriptor right away
        # instead of leaking one per parser instance.
        fd, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
        os.close(fd)

    def get_thumbnail(self, document_path, mime_type):
        # not important during tests
        raise NotImplementedError()

    def get_optimised_thumbnail(self, document_path, mime_type):
        """Return the path of the placeholder thumbnail."""
        return self.fake_thumb

    def parse(self, document_path, mime_type):
        """Pretend to parse the document by setting a constant text."""
        self.text = "The Text"
|
2020-11-16 23:16:37 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class FaultyParser(DocumentParser):
    """Parser stub whose ``parse`` always fails with ``ParseError``.

    ``get_optimised_thumbnail`` returns the path of an empty ``.png`` file
    created inside ``scratch_dir``.
    """

    def __init__(self, logging_group, scratch_dir):
        """Create the stub and its placeholder thumbnail file.

        :param logging_group: forwarded unchanged to ``DocumentParser``.
        :param scratch_dir: directory in which the placeholder thumbnail
            file is created.
        """
        super().__init__(logging_group)
        # mkstemp hands back an open OS-level descriptor together with the
        # path. Only the path is needed, so close the descriptor right away
        # instead of leaking one per parser instance.
        fd, self.fake_thumb = tempfile.mkstemp(suffix=".png", dir=scratch_dir)
        os.close(fd)

    def get_thumbnail(self, document_path, mime_type):
        # not important during tests
        raise NotImplementedError()

    def get_optimised_thumbnail(self, document_path, mime_type):
        """Return the path of the placeholder thumbnail."""
        return self.fake_thumb

    def parse(self, document_path, mime_type):
        """Always raise ``ParseError`` to simulate a failing parser."""
        raise ParseError("Does not compute.")
|
|
|
|
|
|
|
|
|
|
|
|
|
2020-11-20 13:31:03 +01:00
|
|
|
|
def fake_magic_from_file(file, mime=False):
    """Stand-in for ``magic.from_file`` that looks only at the extension.

    :param file: path whose extension is inspected; the file is never opened.
    :param mime: when truthy, return a MIME-type string; otherwise return a
        fixed descriptive string.
    :return: ``"application/pdf"`` for a ``.pdf`` extension and ``"unknown"``
        for anything else when ``mime`` is truthy; a constant description
        string when it is not.
    """
    if not mime:
        return "A verbose string that describes the contents of the file"

    _, extension = os.path.splitext(file)
    return "application/pdf" if extension == ".pdf" else "unknown"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
class TestConsumer(TestCase):
    """Exercise ``Consumer.try_consume_file`` against stub parsers.

    Every test runs with temporary scratch, media and consumption
    directories, with ``magic.from_file`` replaced by
    ``fake_magic_from_file`` and with the parser declaration signal patched
    to offer ``DummyParser`` for ``application/pdf``.
    """

    def make_dummy_parser(self, logging_group):
        """Parser factory handing out a ``DummyParser`` bound to the scratch dir."""
        return DummyParser(logging_group, self.scratch_dir)

    def make_faulty_parser(self, logging_group):
        """Parser factory handing out a ``FaultyParser`` bound to the scratch dir."""
        return FaultyParser(logging_group, self.scratch_dir)

    def setUp(self):
        self.scratch_dir = tempfile.mkdtemp()
        self.media_dir = tempfile.mkdtemp()
        self.consumption_dir = tempfile.mkdtemp()

        settings_override = override_settings(
            SCRATCH_DIR=self.scratch_dir,
            MEDIA_ROOT=self.media_dir,
            ORIGINALS_DIR=os.path.join(
                self.media_dir, "documents", "originals"),
            THUMBNAIL_DIR=os.path.join(
                self.media_dir, "documents", "thumbnails"),
            CONSUMPTION_DIR=self.consumption_dir
        )
        settings_override.enable()
        # Undo the override after each test; otherwise the settings keep
        # pointing at directories that tearDown has already removed.
        self.addCleanup(settings_override.disable)

        patcher = mock.patch(
            "documents.parsers.document_consumer_declaration.send")
        send_mock = patcher.start()
        send_mock.return_value = [(None, {
            "parser": self.make_dummy_parser,
            "mime_types": ["application/pdf"],
            "weight": 0
        })]
        self.addCleanup(patcher.stop)

        self.consumer = Consumer()

    def tearDown(self):
        shutil.rmtree(self.scratch_dir, ignore_errors=True)
        shutil.rmtree(self.media_dir, ignore_errors=True)
        shutil.rmtree(self.consumption_dir, ignore_errors=True)

    def get_test_file(self):
        """Create an empty ``.pdf`` file in the scratch dir and return its path."""
        fd, path = tempfile.mkstemp(suffix=".pdf", dir=self.scratch_dir)
        # Only the path is used; close the descriptor mkstemp opened so it
        # is not leaked.
        os.close(fd)
        return path

    def testNormalOperation(self):
        filename = self.get_test_file()
        document = self.consumer.try_consume_file(filename)

        self.assertEqual(document.content, "The Text")
        self.assertEqual(
            document.title,
            os.path.splitext(os.path.basename(filename))[0])
        self.assertIsNone(document.correspondent)
        self.assertIsNone(document.document_type)
        self.assertEqual(document.filename, "0000001.pdf")

        self.assertTrue(os.path.isfile(document.source_path))
        self.assertTrue(os.path.isfile(document.thumbnail_path))

        # The consumed file must be gone from its original location.
        self.assertFalse(os.path.isfile(filename))

    def testOverrideFilename(self):
        filename = self.get_test_file()
        override_filename = "My Bank - Statement for November.pdf"

        document = self.consumer.try_consume_file(
            filename, override_filename=override_filename)

        self.assertEqual(document.correspondent.name, "My Bank")
        self.assertEqual(document.title, "Statement for November")

    def testOverrideTitle(self):
        document = self.consumer.try_consume_file(
            self.get_test_file(), override_title="Override Title")
        self.assertEqual(document.title, "Override Title")

    def testOverrideCorrespondent(self):
        c = Correspondent.objects.create(name="test")

        document = self.consumer.try_consume_file(
            self.get_test_file(), override_correspondent_id=c.pk)
        self.assertEqual(document.correspondent.id, c.id)

    def testOverrideDocumentType(self):
        dt = DocumentType.objects.create(name="test")

        document = self.consumer.try_consume_file(
            self.get_test_file(), override_document_type_id=dt.pk)
        self.assertEqual(document.document_type.id, dt.id)

    def testOverrideTags(self):
        t1 = Tag.objects.create(name="t1")
        t2 = Tag.objects.create(name="t2")
        t3 = Tag.objects.create(name="t3")
        document = self.consumer.try_consume_file(
            self.get_test_file(), override_tag_ids=[t1.id, t3.id])

        self.assertIn(t1, document.tags.all())
        self.assertNotIn(t2, document.tags.all())
        self.assertIn(t3, document.tags.all())

    def testNotAFile(self):
        with self.assertRaises(ConsumerError) as ctx:
            self.consumer.try_consume_file("non-existing-file")
        self.assertTrue(str(ctx.exception).endswith('It is not a file'))

    @override_settings(CONSUMPTION_DIR=None)
    def testConsumptionDirUnset(self):
        with self.assertRaises(ConsumerError) as ctx:
            self.consumer.try_consume_file(self.get_test_file())
        self.assertEqual(
            str(ctx.exception),
            "The CONSUMPTION_DIR settings variable does not appear to be set.")

    @override_settings(CONSUMPTION_DIR="asd")
    def testNoConsumptionDir(self):
        with self.assertRaises(ConsumerError) as ctx:
            self.consumer.try_consume_file(self.get_test_file())
        self.assertEqual(
            str(ctx.exception), "Consumption directory asd does not exist")

    def testDuplicates(self):
        self.consumer.try_consume_file(self.get_test_file())

        # A second empty file has the same content as the first one.
        with self.assertRaises(ConsumerError) as ctx:
            self.consumer.try_consume_file(self.get_test_file())
        self.assertTrue(str(ctx.exception).endswith("It is a duplicate."))

    @mock.patch("documents.parsers.document_consumer_declaration.send")
    def testNoParsers(self, m):
        m.return_value = []

        with self.assertRaises(ConsumerError) as ctx:
            self.consumer.try_consume_file(self.get_test_file())
        # NOTE(review): "abvailable" is misspelled; presumably it mirrors the
        # message raised by the consumer - confirm there before correcting it.
        self.assertTrue(
            str(ctx.exception).startswith("No parsers abvailable"))

    @mock.patch("documents.parsers.document_consumer_declaration.send")
    def testFaultyParser(self, m):
        m.return_value = [(None, {
            "parser": self.make_faulty_parser,
            "mime_types": ["application/pdf"],
            "weight": 0
        })]

        with self.assertRaises(ConsumerError) as ctx:
            self.consumer.try_consume_file(self.get_test_file())
        self.assertEqual(str(ctx.exception), "Does not compute.")

    @mock.patch("documents.consumer.Consumer._write")
    def testPostSaveError(self, m):
        filename = self.get_test_file()
        m.side_effect = OSError("NO.")

        with self.assertRaises(ConsumerError) as ctx:
            self.consumer.try_consume_file(filename)
        self.assertEqual(str(ctx.exception), "NO.")

        # file not deleted
        self.assertTrue(os.path.isfile(filename))

        # Database empty
        self.assertEqual(Document.objects.count(), 0)

    @override_settings(PAPERLESS_FILENAME_FORMAT="{correspondent}/{title}")
    def testFilenameHandling(self):
        filename = self.get_test_file()

        document = self.consumer.try_consume_file(
            filename,
            override_filename="Bank - Test.pdf",
            override_title="new docs")

        self.assertEqual(document.title, "new docs")
        self.assertEqual(document.correspondent.name, "Bank")
        self.assertEqual(document.filename, "bank/new-docs-0000001.pdf")

    @mock.patch("documents.consumer.DocumentClassifier")
    def testClassifyDocument(self, m):
        correspondent = Correspondent.objects.create(name="test")
        dtype = DocumentType.objects.create(name="test")
        t1 = Tag.objects.create(name="t1")
        t2 = Tag.objects.create(name="t2")

        # The patched classifier predicts the objects created above; only
        # t1 is among the predicted tags.
        m.return_value = MagicMock()
        m.return_value.predict_correspondent.return_value = correspondent.pk
        m.return_value.predict_document_type.return_value = dtype.pk
        m.return_value.predict_tags.return_value = [t1.pk]

        document = self.consumer.try_consume_file(self.get_test_file())

        self.assertEqual(document.correspondent, correspondent)
        self.assertEqual(document.document_type, dtype)
        self.assertIn(t1, document.tags.all())
        self.assertNotIn(t2, document.tags.all())
|