2022-10-15 15:41:43 +02:00
import datetime
2022-10-13 01:03:09 +02:00
import os
2022-10-14 15:43:43 +02:00
from unittest import mock
2022-10-13 01:03:09 +02:00
2022-10-14 15:43:43 +02:00
import pytest
2022-10-13 01:03:09 +02:00
from django . test import TestCase
2022-10-14 15:43:43 +02:00
from documents . parsers import ParseError
2022-10-13 01:03:09 +02:00
from paperless_mail . parsers import MailDocumentParser
class TestParser ( TestCase ) :
SAMPLE_FILES = os . path . join ( os . path . dirname ( __file__ ) , " samples " )
2022-11-03 00:58:36 +01:00
def setUp(self) -> None:
    """Give every test its own MailDocumentParser, created without a logging group."""
    parser = MailDocumentParser(logging_group=None)
    self.parser = parser
def tearDown(self) -> None:
    """Call the parser's cleanup() once the test has finished."""
    parser = self.parser
    parser.cleanup()
2022-10-14 15:43:43 +02:00
2022-11-03 00:58:36 +01:00
def test_get_parsed(self):
    """Check get_parsed() error handling, extracted fields and result reuse."""
    missing_path = os.path.join(self.SAMPLE_FILES, " na ")
    broken_path = os.path.join(self.SAMPLE_FILES, " broken.eml ")
    simple_path = os.path.join(self.SAMPLE_FILES, " simple_text.eml ")

    # Check if exception is raised when parsing fails.
    with pytest.raises(ParseError):
        self.parser.get_parsed(missing_path)

    # Check if exception is raised when the mail is faulty.
    with pytest.raises(ParseError):
        self.parser.get_parsed(broken_path)

    # Parse the test file and check the relevant content.
    parsed1 = self.parser.get_parsed(simple_path)

    self.assertEqual(parsed1.date.year, 2022)
    self.assertEqual(parsed1.date.month, 10)
    self.assertEqual(parsed1.date.day, 12)
    self.assertEqual(parsed1.date.hour, 21)
    self.assertEqual(parsed1.date.minute, 40)
    self.assertEqual(parsed1.date.second, 43)
    self.assertEqual(parsed1.date.tzname(), " UTC+02:00 ")
    self.assertEqual(parsed1.from_, " mail@someserver.de ")
    self.assertEqual(parsed1.subject, " Simple Text Mail ")
    self.assertEqual(parsed1.text, " This is just a simple Text Mail. \n ")
    self.assertEqual(parsed1.to, (" some@one.de ",))

    # Check if the same parsed object as before is returned, even if
    # another file is given. (A single os.path.join is enough here; the
    # former outer join around it had only one argument.)
    parsed2 = self.parser.get_parsed(missing_path)

    self.assertEqual(parsed1, parsed2)
2022-11-03 00:58:36 +01:00
@mock.patch(" paperless_mail.parsers.MailDocumentParser.generate_pdf ")
@mock.patch(" paperless_mail.parsers.make_thumbnail_from_pdf ")
def test_get_thumbnail(
    self,
    mock_make_thumbnail_from_pdf: mock.MagicMock,
    mock_generate_pdf: mock.MagicMock,
):
    """Check that get_thumbnail() forwards to make_thumbnail_from_pdf.

    Both the PDF generation and the thumbnail helper are patched; the test
    inspects the arguments of the first thumbnail call and the value that
    get_thumbnail() hands back.
    """
    mocked_return = " Passing the return value through.. "
    mock_generate_pdf.return_value = " Mocked return value.. "
    mock_make_thumbnail_from_pdf.return_value = mocked_return

    sample_mail = os.path.join(self.SAMPLE_FILES, " simple_text.eml ")
    thumb = self.parser.get_thumbnail(sample_mail, " message/rfc822 ")

    # Positional arguments of the first call to the thumbnail helper.
    first_call_args = mock_make_thumbnail_from_pdf.call_args_list[0].args

    self.assertEqual(self.parser.archive_path, first_call_args[0])
    self.assertEqual(self.parser.tempdir, first_call_args[1])
    self.assertEqual(mocked_return, thumb)
2022-10-14 15:43:43 +02:00
@mock.patch(" documents.loggers.LoggingMixin.log ")
def test_extract_metadata(self, m: mock.MagicMock):
    """Check extract_metadata() for an unparsable path and for a sample mail.

    m is the patched LoggingMixin.log; its most recent call is inspected
    to confirm that a warning was logged for the unparsable path.
    """
    # Validate if warning is logged when parsing fails
    self.assertEqual([], self.parser.extract_metadata(" na ", " message/rfc822 "))
    self.assertEqual(" warning ", m.call_args[0][0])

    # Validate Metadata parsing returns the expected results
    metadata = self.parser.extract_metadata(
        os.path.join(self.SAMPLE_FILES, " simple_text.eml "),
        " message/rfc822 ",
    )

    # (prefix, key, value) of every entry that must be present; the
    # namespace is the same for all of them.
    expected_entries = [
        (" ", " attachments ", " "),
        (" ", " date ", " 2022-10-12 21:40:43 UTC+02:00 "),
        (" header ", " content-language ", " en-US "),
        (
            " header ",
            " content-type ",
            " text/plain; charset=UTF-8; format=flowed ",
        ),
        (" header ", " date ", " Wed, 12 Oct 2022 21:40:43 +0200 "),
        (" header ", " delivered-to ", " mail@someserver.de "),
        (" header ", " from ", " Some One <mail@someserver.de> "),
        (
            " header ",
            " message-id ",
            " <6e99e34d-e20a-80c4-ea61-d8234b612be9@someserver.de> ",
        ),
        (" header ", " mime-version ", " 1.0 "),
        (
            " header ",
            " received ",
            " from mail.someserver.org ([::1]) \n \t by e1acdba3bd07 with LMTP \n \t id KBKZGD2YR2NTCgQAjubtDA \n \t (envelope-from <mail@someserver.de>) \n \t for <mail@someserver.de>; Wed, 10 Oct 2022 11:40:46 +0200, from [127.0.0.1] (localhost [127.0.0.1]) by localhost (Mailerdaemon) with ESMTPSA id 2BC9064C1616 \n \t for <some@one.de>; Wed, 12 Oct 2022 21:40:46 +0200 (CEST) ",
        ),
        (" header ", " return-path ", " <mail@someserver.de> "),
        (" header ", " subject ", " Simple Text Mail "),
        (" header ", " to ", " some@one.de "),
        (
            " header ",
            " user-agent ",
            " Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 \n Thunderbird/102.3.1 ",
        ),
        (" header ", " x-last-tls-session-version ", " TLSv1.3 "),
    ]

    for prefix, key, value in expected_entries:
        entry = {
            " namespace ": " ",
            " prefix ": prefix,
            " key ": key,
            " value ": value,
        }
        # assertIn reports the missing entry and the actual metadata on
        # failure, which a bare assertTrue(... in ...) does not.
        self.assertIn(entry, metadata)
2022-10-15 13:13:29 +02:00
2022-11-03 00:58:36 +01:00
def test_parse_na(self):
    """Check that parse() raises ParseError for the " na " sample path."""
    # One os.path.join builds the path; the previous outer join around it
    # received a single argument and did nothing.
    missing_path = os.path.join(self.SAMPLE_FILES, " na ")

    # Check if exception is raised when parsing fails.
    with pytest.raises(ParseError):
        self.parser.parse(missing_path, " message/rfc822 ")
2022-11-13 22:33:26 +01:00
@mock.patch(" paperless_mail.parsers.MailDocumentParser.tika_parse ")
@mock.patch(" paperless_mail.parsers.MailDocumentParser.generate_pdf ")
@mock.patch(" documents.loggers.LoggingMixin.log ")  # Disable log output
def test_parse_html_eml(self, m, n, mock_tika_parse: mock.MagicMock):
    """Check the text and date set by parse() for the HTML sample mail.

    Logging (m), PDF generation (n) and tika_parse are patched; the
    patched tika_parse supplies the trailing part of the expected text.
    """
    mock_tika_parse.return_value = " tika return "
    text_expected = " Some Text \n and an embedded image. \n \n Subject: HTML Message \n \n From: Name <someone@example.de> \n \n To: someone@example.de \n \n Attachments: IntM6gnXFm00FEV5.png (6.89 KiB), 600+kbfile.txt (0.59 MiB) \n \n HTML content: tika return "
    # Fixed offset of 7200 seconds for the expected timestamp.
    offset = datetime.timezone(datetime.timedelta(seconds=7200))
    date_expected = datetime.datetime(2022, 10, 15, 11, 23, 19, tzinfo=offset)

    # Validate parsing returns the expected results
    html_mail = os.path.join(self.SAMPLE_FILES, " html.eml ")
    self.parser.parse(html_mail, " message/rfc822 ")

    self.assertEqual(text_expected, self.parser.text)
    self.assertEqual(date_expected, self.parser.date)
2022-11-03 00:58:36 +01:00
@mock.patch(" paperless_mail.parsers.MailDocumentParser.generate_pdf ")
@mock.patch(" documents.loggers.LoggingMixin.log ")  # Disable log output
def test_parse_simple_eml(self, m, n):
    """Check text, date and archive file after parse() on the plain-text sample.

    Logging (m) and generate_pdf (n) are patched for the duration of the test.
    """
    text_expected = " This is just a simple Text Mail. \n \n Subject: Simple Text Mail \n \n From: Some One <mail@someserver.de> \n \n To: some@one.de \n \n CC: asdasd@æsdasd.de, asdadasdasdasda.asdasd@æsdasd.de \n \n BCC: fdf@fvf.de \n \n "
    # Fixed offset of 7200 seconds for the expected timestamp.
    offset = datetime.timezone(datetime.timedelta(seconds=7200))
    date_expected = datetime.datetime(2022, 10, 12, 21, 40, 43, tzinfo=offset)

    # Validate parsing returns the expected results
    simple_mail = os.path.join(self.SAMPLE_FILES, " simple_text.eml ")
    self.parser.parse(simple_mail, " message/rfc822 ")

    self.assertEqual(text_expected, self.parser.text)
    self.assertEqual(date_expected, self.parser.date)

    # Just check if file exists, the unittest for generate_pdf() goes deeper.
    archive_exists = os.path.isfile(self.parser.archive_path)
    self.assertTrue(archive_exists)
2022-10-15 15:41:43 +02:00
2022-11-13 22:33:26 +01:00
@mock.patch(" paperless_mail.parsers.parser.from_buffer ")
@mock.patch(" documents.loggers.LoggingMixin.log ")  # Disable log output
def test_tika_parse(self, m, mock_from_buffer: mock.MagicMock):
    """Check tika_parse() for empty content, real content and a failing backend.

    m is the patched logger; mock_from_buffer replaces the patched
    parser.from_buffer and is reconfigured for each scenario.
    """
    html = ' <html><head><meta http-equiv= " content-type " content= " text/html; charset=UTF-8 " ></head><body><p>Some Text</p></body></html> '
    expected_text = " Some Text "

    # Check unsuccessful parsing
    mock_from_buffer.return_value = {" content ": None}
    parsed = self.parser.tika_parse(None)
    self.assertEqual(" ", parsed)

    # Check successful parsing
    mock_from_buffer.return_value = {" content ": expected_text}
    parsed = self.parser.tika_parse(html)
    self.assertEqual(expected_text, parsed.strip())
    mock_from_buffer.assert_called_with(html, self.parser.tika_server)

    # Check ParseError
    # An exception instance as side_effect is raised on every call. The
    # previous zero-argument function was invoked with the mock's call
    # arguments and therefore failed with TypeError instead of raising
    # the intended exception.
    mock_from_buffer.side_effect = Exception(" Test ")
    self.assertRaises(ParseError, self.parser.tika_parse, html)
2022-10-18 23:48:07 +02:00
2022-10-27 00:27:15 +02:00
@mock.patch(" paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail ")
@mock.patch(" paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html ")
def test_generate_pdf_parse_error(self, m: mock.MagicMock, n: mock.MagicMock):
    """Check that generate_pdf() raises ParseError when the PDF cannot be created.

    m patches generate_pdf_from_html and n patches generate_pdf_from_mail.
    """
    for patched in (m, n):
        patched.return_value = b" "

    # Check if exception is raised when the pdf can not be created.
    self.parser.gotenberg_server = " "
    html_mail = os.path.join(self.SAMPLE_FILES, " html.eml ")
    with pytest.raises(ParseError):
        self.parser.generate_pdf(html_mail)
2022-10-22 02:25:23 +02:00
2022-11-03 00:58:36 +01:00
@mock.patch(" paperless_mail.parsers.requests.post ")
@mock.patch(" paperless_mail.parsers.MailDocumentParser.generate_pdf_from_mail ")
@mock.patch(" paperless_mail.parsers.MailDocumentParser.generate_pdf_from_html ")
def test_generate_pdf(
    self,
    mock_generate_pdf_from_html: mock.MagicMock,
    mock_generate_pdf_from_mail: mock.MagicMock,
    mock_post: mock.MagicMock,
):
    """Check generate_pdf() for a broken mail and for the HTML sample mail.

    The two partial PDF generators and requests.post are patched; the test
    inspects what was posted and what ended up in the returned file.
    """
    # Check if exception is raised when the mail can not be parsed.
    broken_mail = os.path.join(self.SAMPLE_FILES, " broken.eml ")
    with pytest.raises(ParseError):
        self.parser.generate_pdf(broken_mail)

    mock_generate_pdf_from_mail.return_value = b" Mail Return "
    mock_generate_pdf_from_html.return_value = b" HTML Return "

    mock_response = mock.MagicMock()
    mock_response.content = b" Content "
    mock_post.return_value = mock_response

    html_mail = os.path.join(self.SAMPLE_FILES, " html.eml ")
    pdf_path = self.parser.generate_pdf(html_mail)
    self.assertTrue(os.path.isfile(pdf_path))

    # Both partial generators must have been called exactly once with the
    # parsed mail (or its html and attachments respectively).
    mock_generate_pdf_from_mail.assert_called_once_with(
        self.parser.get_parsed(None),
    )
    mock_generate_pdf_from_html.assert_called_once_with(
        self.parser.get_parsed(None).html,
        self.parser.get_parsed(None).attachments,
    )

    # Inspect the request made through the patched requests.post.
    expected_url = self.parser.gotenberg_server + " /forms/pdfengines/merge "
    self.assertEqual(expected_url, mock_post.call_args.args[0])
    self.assertEqual({}, mock_post.call_args.kwargs[" headers "])
    self.assertEqual(
        b" Mail Return ",
        mock_post.call_args.kwargs[" files "][" 1_mail.pdf "][1].read(),
    )
    self.assertEqual(
        b" HTML Return ",
        mock_post.call_args.kwargs[" files "][" 2_html.pdf "][1].read(),
    )

    mock_response.raise_for_status.assert_called_once()

    # The response content must have been written to the returned path.
    with open(pdf_path, " rb ") as file:
        written = file.read()
        self.assertEqual(b" Content ", written)
2022-10-22 02:25:23 +02:00
2022-10-23 17:18:10 +02:00
def test_mail_to_html(self):
    """Compare mail_to_html() output for the HTML sample with the stored expectation."""
    source_path = os.path.join(self.SAMPLE_FILES, " html.eml ")
    expected_path = os.path.join(self.SAMPLE_FILES, " html.eml.html ")

    mail = self.parser.get_parsed(source_path)
    html_handle = self.parser.mail_to_html(mail)

    with open(expected_path) as html_expected_handle:
        expected_html = html_expected_handle.read()
        actual_html = html_handle.read()
        self.assertHTMLEqual(expected_html, actual_html)
2022-11-12 15:48:30 +01:00
@mock.patch(" paperless_mail.parsers.requests.post ")
@mock.patch(" paperless_mail.parsers.MailDocumentParser.mail_to_html ")
def test_generate_pdf_from_mail(
    self,
    mock_mail_to_html: mock.MagicMock,
    mock_post: mock.MagicMock,
):
    """Check the request generate_pdf_from_mail() sends and the bytes it returns.

    mail_to_html and requests.post are patched, so only the arguments of
    the post call and the pass-through of the response content are checked.
    """
    mock_response = mock.MagicMock()
    mock_response.content = b" Content "
    mock_post.return_value = mock_response
    mock_mail_to_html.return_value = " Testresponse "

    html_mail = os.path.join(self.SAMPLE_FILES, " html.eml ")
    mail = self.parser.get_parsed(html_mail)

    retval = self.parser.generate_pdf_from_mail(mail)
    self.assertEqual(b" Content ", retval)

    mock_mail_to_html.assert_called_once_with(mail)

    # Arguments of the most recent call to the patched requests.post.
    post_call = mock_post.call_args
    expected_url = self.parser.gotenberg_server + " /forms/chromium/convert/html "
    expected_data = {
        " marginTop ": " 0.1 ",
        " marginBottom ": " 0.1 ",
        " marginLeft ": " 0.1 ",
        " marginRight ": " 0.1 ",
        " paperWidth ": " 8.27 ",
        " paperHeight ": " 11.7 ",
        " scale ": " 1.0 ",
    }

    self.assertEqual(expected_url, post_call.args[0])
    self.assertEqual({}, post_call.kwargs[" headers "])
    self.assertEqual(expected_data, post_call.kwargs[" data "])
    self.assertEqual(" Testresponse ", post_call.kwargs[" files "][" html "][1])
    self.assertEqual(" output.css ", post_call.kwargs[" files "][" css "][0])

    mock_response.raise_for_status.assert_called_once()
2022-10-23 17:18:10 +02:00
2022-10-18 23:48:07 +02:00
def test_transform_inline_html(self):
    """Check the entries returned by transform_inline_html() for the sample page.

    The last entry is expected to be named " index.html ", to contain the
    name of the first entry, and to be free of script tags.
    """

    class MailAttachmentMock:
        """Minimal stand-in carrying only a payload and a content id."""

        def __init__(self, payload, content_id):
            self.payload = payload
            self.content_id = content_id

    html_path = os.path.join(self.SAMPLE_FILES, " sample.html ")
    png_path = os.path.join(self.SAMPLE_FILES, " sample.png ")
    with open(html_path) as html_file, open(png_path, " rb ") as png_file:
        html = html_file.read()
        png = png_file.read()

    attachments = [
        MailAttachmentMock(png, " part1.pNdUSz0s.D3NqVtPg@example.de "),
    ]
    result = self.parser.transform_inline_html(html, attachments)

    resulting_html = result[-1][1].read()
    # Dedicated assertions show the compared values when they fail,
    # unlike assertTrue/assertFalse on a precomputed boolean.
    self.assertEqual(result[-1][0], " index.html ")
    self.assertIn(result[0][0], resulting_html)
    self.assertNotIn(" <script ", resulting_html.lower())
2022-10-19 23:19:33 +02:00
2022-11-12 15:48:30 +01:00
@mock.patch(" paperless_mail.parsers.requests.post ")
@mock.patch(" documents.loggers.LoggingMixin.log ")  # Disable log output
def test_generate_pdf_from_html(self, m, mock_post: mock.MagicMock):
    """Check the request generate_pdf_from_html() sends and the bytes it returns.

    Logging (m) and requests.post are patched; the test inspects the
    arguments of the post call and the pass-through of the response content.
    """

    class MailAttachmentMock:
        """Minimal stand-in carrying only a payload and a content id."""

        def __init__(self, payload, content_id):
            self.payload = payload
            self.content_id = content_id

    mock_response = mock.MagicMock()
    mock_response.content = b" Content "
    mock_post.return_value = mock_response

    html_path = os.path.join(self.SAMPLE_FILES, " sample.html ")
    png_path = os.path.join(self.SAMPLE_FILES, " sample.png ")
    with open(html_path) as html_file:
        with open(png_path, " rb ") as png_file:
            html = html_file.read()
            png = png_file.read()

    attachments = [
        MailAttachmentMock(png, " part1.pNdUSz0s.D3NqVtPg@example.de "),
    ]
    result = self.parser.generate_pdf_from_html(html, attachments)

    # Arguments of the most recent call to the patched requests.post.
    post_call = mock_post.call_args
    expected_url = self.parser.gotenberg_server + " /forms/chromium/convert/html "
    expected_data = {
        " marginTop ": " 0.1 ",
        " marginBottom ": " 0.1 ",
        " marginLeft ": " 0.1 ",
        " marginRight ": " 0.1 ",
        " paperWidth ": " 8.27 ",
        " paperHeight ": " 11.7 ",
        " scale ": " 1.0 ",
    }

    self.assertEqual(expected_url, post_call.args[0])
    self.assertEqual({}, post_call.kwargs[" headers "])
    self.assertEqual(expected_data, post_call.kwargs[" data "])

    # read to assert it is a file like object.
    posted_files = post_call.kwargs[" files "]
    posted_files[" cidpart1pNdUSz0sD3NqVtPgexamplede "][1].read()
    posted_files[" index.html "][1].read()

    mock_response.raise_for_status.assert_called_once()

    self.assertEqual(b" Content ", result)