# 2020-11-19 20:28:41 +01:00  (stray revision timestamp, commented out so the module parses)
import os
import shutil
import tempfile
import uuid
from typing import ContextManager
from unittest import mock
from django . test import TestCase , override_settings
from documents . parsers import ParseError , run_convert
# 2020-11-25 14:51:32 +01:00  (stray revision timestamp, commented out so the module parses)
from paperless_tesseract . parsers import RasterisedDocumentParser , get_text_from_pdf
# 2020-11-19 20:28:41 +01:00  (stray revision timestamp, commented out so the module parses)
# Module-level list, initially empty. Nothing in the visible code appends to
# or reads from it.
# NOTE(review): presumably a call log for a mocked image_to_string - confirm
# whether it is still needed.
image_to_string_calls = []
def fake_convert(input_file, output_file, **kwargs):
    """Write every line of a text file to its own numbered output file.

    Args:
        input_file: Path of the text file to read.
        output_file: %-style template with one integer placeholder
            (for example ``"page-%d.txt"``); line number ``i`` (zero-based)
            is written to ``output_file % i``.
        **kwargs: Accepted and ignored, so the function can be called with
            arbitrary extra keyword options.
    """
    # Read everything first so the input is closed before any output file
    # is opened.
    with open(input_file) as source:
        lines = source.readlines()

    for index, line in enumerate(lines):
        # "w" must be spelled without padding: open() rejects " w " with
        # ValueError("invalid mode").
        with open(output_file % index, "w") as target:
            target.write(line.strip())
class FakeImageFile(ContextManager):
    """Context manager that yields the base name of the path it was given.

    No file is opened: entering the context only computes
    ``os.path.basename(fname)``, and leaving it does nothing.
    """

    def __init__(self, fname):
        """Remember *fname*, the path whose base name will be yielded."""
        self.fname = fname

    def __enter__(self):
        """Return the final path component of ``self.fname``."""
        base_name = os.path.basename(self.fname)
        return base_name

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Do nothing; returning None lets any exception propagate."""
        return None
# 2020-11-26 00:08:23 +01:00  (stray revision timestamp, commented out so the module parses)
class TestParser(TestCase):
    """Tests for ``RasterisedDocumentParser`` and ``get_text_from_pdf``.

    Every test works on a file from the ``samples`` directory next to this
    module while the ``SCRATCH_DIR`` setting points at a temporary directory
    that is created in ``setUp`` and removed in ``tearDown``.
    """

    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")

    # Expected text that several tests compare against.
    SIMPLE_TEXT = "This is a test document."
    FORM_TEXT = (
        "Please enter your name in here:\n\n"
        "This is a PDF document with a form."
    )

    def setUp(self):
        """Create a scratch directory and point SCRATCH_DIR at it."""
        self.scratch = tempfile.mkdtemp()
        # Keep a handle on the override so tearDown can undo it; an override
        # that is enabled but never disabled leaks into later tests.
        self._scratch_override = override_settings(SCRATCH_DIR=self.scratch)
        self._scratch_override.enable()

    def tearDown(self):
        """Restore SCRATCH_DIR and delete the scratch directory."""
        self._scratch_override.disable()
        shutil.rmtree(self.scratch)

    def _sample(self, name):
        """Return the path of the sample file called *name*."""
        return os.path.join(self.SAMPLE_FILES, name)

    @staticmethod
    def _multi_page_text(page_count):
        """Return the expected text for the first *page_count* sample pages.

        Pages are joined by a blank line, e.g. for two pages:
        ``"This is a multi page document. Page 1.\\n\\n"
        "This is a multi page document. Page 2."``
        """
        return "\n\n".join(
            "This is a multi page document. Page {}.".format(page)
            for page in range(1, page_count + 1)
        )

    def test_get_text_from_pdf(self):
        """get_text_from_pdf returns the text of a digital PDF."""
        text = get_text_from_pdf(self._sample("simple-digital.pdf"))

        self.assertEqual(text.strip(), self.SIMPLE_TEXT)

    def test_thumbnail(self):
        """get_thumbnail runs on a digital PDF without raising."""
        parser = RasterisedDocumentParser(uuid.uuid4())
        # Smoke test only: there is no assertion on the result, the call just
        # must not raise.
        parser.get_thumbnail(
            self._sample("simple-digital.pdf"), "application/pdf"
        )

    @mock.patch("paperless_tesseract.parsers.run_convert")
    def test_thumbnail_fallback(self, mocked_convert):
        """get_thumbnail still succeeds when converting the PDF itself fails."""

        def call_convert(input_file, output_file, **kwargs):
            # Fail for PDF input, delegate to the real run_convert otherwise.
            if ".pdf" in input_file:
                raise ParseError("Does not compute.")
            run_convert(
                input_file=input_file, output_file=output_file, **kwargs
            )

        mocked_convert.side_effect = call_convert

        parser = RasterisedDocumentParser(uuid.uuid4())
        # Smoke test only: there is no assertion on the result, the call just
        # must not raise.
        parser.get_thumbnail(
            self._sample("simple-digital.pdf"), "application/pdf"
        )

    def test_get_dpi(self):
        """get_dpi returns None without DPI information, else the DPI value."""
        parser = RasterisedDocumentParser(None)

        self.assertIsNone(parser.get_dpi(self._sample("simple-no-dpi.png")))
        self.assertEqual(parser.get_dpi(self._sample("simple.png")), 72)

    def test_simple_digital(self):
        """A digital PDF produces an archive file and its text."""
        parser = RasterisedDocumentParser(None)

        parser.parse(self._sample("simple-digital.pdf"), "application/pdf")

        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertEqual(parser.get_text(), self.SIMPLE_TEXT)

    def test_with_form(self):
        """A PDF containing a form produces an archive file and its text."""
        parser = RasterisedDocumentParser(None)

        parser.parse(self._sample("with-form.pdf"), "application/pdf")

        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertEqual(parser.get_text(), self.FORM_TEXT)

    @override_settings(OCR_MODE="redo")
    def test_with_form_error(self):
        """In "redo" mode the form PDF yields text but no archive file."""
        parser = RasterisedDocumentParser(None)

        parser.parse(self._sample("with-form.pdf"), "application/pdf")

        self.assertIsNone(parser.archive_path)
        self.assertEqual(parser.get_text(), self.FORM_TEXT)

    @override_settings(OCR_MODE="redo")
    @mock.patch(
        "paperless_tesseract.parsers.get_text_from_pdf", lambda _: None
    )
    def test_with_form_error_notext(self):
        """In "redo" mode, with text extraction returning None, parse raises."""
        parser = RasterisedDocumentParser(None)

        with self.assertRaises(ParseError):
            parser.parse(self._sample("with-form.pdf"), "application/pdf")

    @override_settings(OCR_MODE="force")
    def test_with_form_force(self):
        """Only checks that the parser can be constructed in "force" mode."""
        parser = RasterisedDocumentParser(None)
        # NOTE(review): the parse-and-compare half of this test is disabled;
        # confirm the expected behaviour for with-form.pdf in "force" mode
        # before re-enabling it:
        # parser.parse(self._sample("with-form.pdf"), "application/pdf")
        # self.assertEqual(parser.get_text(), self.FORM_TEXT)

    def test_image_simple(self):
        """A PNG with DPI information produces an archive file and its text."""
        parser = RasterisedDocumentParser(None)

        parser.parse(self._sample("simple.png"), "image/png")

        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertEqual(parser.get_text(), self.SIMPLE_TEXT)

    def test_image_simple_alpha_fail(self):
        """Parsing simple-alpha.png raises ParseError."""
        parser = RasterisedDocumentParser(None)

        with self.assertRaises(ParseError):
            parser.parse(self._sample("simple-alpha.png"), "image/png")

    def test_image_no_dpi_fail(self):
        """Parsing simple-no-dpi.png with default settings raises ParseError."""
        parser = RasterisedDocumentParser(None)

        with self.assertRaises(ParseError):
            parser.parse(self._sample("simple-no-dpi.png"), "image/png")

    @override_settings(OCR_IMAGE_DPI=72)
    def test_image_no_dpi_default(self):
        """With OCR_IMAGE_DPI set, simple-no-dpi.png parses successfully."""
        parser = RasterisedDocumentParser(None)

        parser.parse(self._sample("simple-no-dpi.png"), "image/png")

        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertEqual(parser.get_text(), self.SIMPLE_TEXT)

    def test_multi_page(self):
        """A three-page digital PDF yields the text of all three pages."""
        parser = RasterisedDocumentParser(None)

        parser.parse(
            self._sample("multi-page-digital.pdf"), "application/pdf"
        )

        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertEqual(parser.get_text(), self._multi_page_text(3))

    @override_settings(OCR_PAGES=2, OCR_MODE="skip")
    def test_multi_page_pages_skip(self):
        """Digital PDF, OCR_PAGES=2, "skip" mode: all three pages of text."""
        parser = RasterisedDocumentParser(None)

        parser.parse(
            self._sample("multi-page-digital.pdf"), "application/pdf"
        )

        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertEqual(parser.get_text(), self._multi_page_text(3))

    @override_settings(OCR_PAGES=2, OCR_MODE="redo")
    def test_multi_page_pages_redo(self):
        """Digital PDF, OCR_PAGES=2, "redo" mode: all three pages of text."""
        parser = RasterisedDocumentParser(None)

        parser.parse(
            self._sample("multi-page-digital.pdf"), "application/pdf"
        )

        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertEqual(parser.get_text(), self._multi_page_text(3))

    @override_settings(OCR_PAGES=2, OCR_MODE="force")
    def test_multi_page_pages_force(self):
        """Digital PDF, OCR_PAGES=2, "force" mode: all three pages of text."""
        parser = RasterisedDocumentParser(None)

        parser.parse(
            self._sample("multi-page-digital.pdf"), "application/pdf"
        )

        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertEqual(parser.get_text(), self._multi_page_text(3))

    # The setting name must be spelled OCR_MODE; override_settings accepts
    # any keyword, so a misspelled name is silently ignored.
    @override_settings(OCR_MODE="skip")
    def test_multi_page_analog_pages_skip(self):
        """Image-only PDF in "skip" mode: all three pages of text."""
        parser = RasterisedDocumentParser(None)

        parser.parse(
            self._sample("multi-page-images.pdf"), "application/pdf"
        )

        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertEqual(parser.get_text(), self._multi_page_text(3))

    @override_settings(OCR_PAGES=2, OCR_MODE="redo")
    def test_multi_page_analog_pages_redo(self):
        """Image-only PDF, OCR_PAGES=2, "redo" mode: first two pages only."""
        parser = RasterisedDocumentParser(None)

        parser.parse(
            self._sample("multi-page-images.pdf"), "application/pdf"
        )

        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertEqual(parser.get_text(), self._multi_page_text(2))

    @override_settings(OCR_PAGES=1, OCR_MODE="force")
    def test_multi_page_analog_pages_force(self):
        """Image-only PDF, OCR_PAGES=1, "force" mode: first page only."""
        parser = RasterisedDocumentParser(None)

        parser.parse(
            self._sample("multi-page-images.pdf"), "application/pdf"
        )

        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertEqual(parser.get_text(), self._multi_page_text(1))