2020-11-19 20:28:41 +01:00
import os
import uuid
from typing import ContextManager
from unittest import mock
from django . test import TestCase , override_settings
from documents . parsers import ParseError , run_convert
2020-11-27 14:06:37 +01:00
from documents . tests . utils import DirectoriesMixin
2021-02-21 00:18:34 +01:00
from paperless_tesseract . parsers import RasterisedDocumentParser , strip_excess_whitespace
2020-11-19 20:28:41 +01:00
# Module-level list. Nothing in the visible part of this file appends to it or
# reads from it.
# NOTE(review): looks like a leftover from earlier mocking code -- confirm it
# is unused elsewhere before removing.
image_to_string_calls = [ ]
def fake_convert(input_file, output_file, **kwargs):
    """Stand-in for a convert call that writes one output file per input line.

    Args:
        input_file: Path of a text file to read.
        output_file: A ``%``-format pattern (for example ``"page-%d.txt"``)
            that is formatted with the zero-based line index to obtain the
            path of each output file.
        **kwargs: Accepted so the signature is call-compatible with the
            function this replaces; ignored.

    Each output file receives the corresponding input line with leading and
    trailing whitespace removed.
    """
    with open(input_file) as source:
        for index, line in enumerate(source):
            # Plain "w" mode: any other character in the mode string (such as
            # a space) makes open() raise ValueError.
            with open(output_file % index, "w") as target:
                target.write(line.strip())
class FakeImageFile(ContextManager):
    """Context manager standing in for an opened image file.

    Entering the context yields only the base name of the path given at
    construction; leaving it does nothing and never suppresses exceptions.
    """

    def __init__(self, fname):
        """Remember the path this fake image was "opened" from."""
        self.fname = fname

    def __enter__(self):
        """Return the final path component of ``self.fname``."""
        _directory, base_name = os.path.split(self.fname)
        return base_name

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Do nothing; returning ``None`` lets any exception propagate."""
        return None
2020-11-29 19:58:48 +01:00
2020-11-27 14:06:37 +01:00
class TestParser(DirectoriesMixin, TestCase):
    """Tests for ``RasterisedDocumentParser`` and ``strip_excess_whitespace``.

    Most tests parse a file from the ``SAMPLE_FILES`` directory and then check
    the extracted text and whether an archive file was produced.
    """

    # NOTE(review): string literals in this class are reproduced exactly as
    # found, including their leading/trailing spaces -- confirm against the
    # repository that this padding is intended.

    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), " samples ")

    # Pairs of (input, expected output) for strip_excess_whitespace.
    text_cases = [
        (" simple string ", " simple string "),
        (
            " simple newline \n testing string ",
            " simple newline \n testing string ",
        ),
        (
            " utf-8 строка с пробелами в конце ",
            " utf-8 строка с пробелами в конце ",
        ),
    ]

    def _sample_path(self, name):
        """Return the path of the sample file ``name`` inside SAMPLE_FILES."""
        return os.path.join(self.SAMPLE_FILES, name)

    def assertContainsStrings(self, content, strings):
        """Assert that all ``strings`` appear in ``content``, in the given order.

        Each string is searched for starting at the position where the
        previous one matched. Comparing only first-occurrence indices would
        report a wrong order whenever a later string also happens to occur
        earlier in ``content``.
        """
        position = 0
        for s in strings:
            if s not in content:
                self.fail(f" ' { s } ' is not in ' { content } ' ")
            found = content.find(s, position)
            if found < 0:
                self.fail(
                    f"'{s}' does not appear in the expected order in '{content}'"
                )
            position = found

    def test_strip_excess_whitespace(self):
        """Every entry of ``text_cases`` maps to its expected result."""
        for source, result in self.text_cases:
            # subTest keeps going after a failing case and reports which
            # input failed.
            with self.subTest(source=source):
                actual_result = strip_excess_whitespace(source)
                self.assertEqual(
                    result,
                    actual_result,
                    " strip_excess_whitespace( {} ) != ' {} ' , but ' {} ' ".format(
                        source,
                        result,
                        actual_result,
                    ),
                )

    def test_get_text_from_pdf(self):
        """``extract_text`` returns the text of a digital PDF."""
        parser = RasterisedDocumentParser(uuid.uuid4())
        text = parser.extract_text(None, self._sample_path(" simple-digital.pdf "))

        self.assertContainsStrings(text.strip(), [" This is a test document. "])

    def test_thumbnail(self):
        """Smoke test: ``get_thumbnail`` must not raise; the result is not checked."""
        parser = RasterisedDocumentParser(uuid.uuid4())
        parser.get_thumbnail(
            self._sample_path(" simple-digital.pdf "), " application/pdf "
        )

    @mock.patch(" documents.parsers.run_convert ")
    def test_thumbnail_fallback(self, m):
        """Smoke test: ``get_thumbnail`` must not raise when convert fails on the PDF."""

        def call_convert(input_file, output_file, **kwargs):
            # Fail for PDF input, delegate everything else to the real
            # run_convert.
            if " .pdf " in input_file:
                raise ParseError(" Does not compute. ")
            run_convert(input_file=input_file, output_file=output_file, **kwargs)

        m.side_effect = call_convert

        parser = RasterisedDocumentParser(uuid.uuid4())
        parser.get_thumbnail(
            self._sample_path(" simple-digital.pdf "), " application/pdf "
        )

    def test_get_dpi(self):
        """``get_dpi`` is None for the no-DPI sample and 72 for the plain sample."""
        parser = RasterisedDocumentParser(None)

        dpi = parser.get_dpi(self._sample_path(" simple-no-dpi.png "))
        self.assertIsNone(dpi)

        dpi = parser.get_dpi(self._sample_path(" simple.png "))
        self.assertEqual(dpi, 72)

    def test_simple_digital(self):
        """A digital PDF yields an archive file and its text."""
        parser = RasterisedDocumentParser(None)
        parser.parse(self._sample_path(" simple-digital.pdf "), " application/pdf ")

        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(
            parser.get_text(), [" This is a test document. "]
        )

    def test_with_form(self):
        """A PDF with a form yields an archive file and its text."""
        parser = RasterisedDocumentParser(None)
        parser.parse(self._sample_path(" with-form.pdf "), " application/pdf ")

        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(
            parser.get_text(),
            [
                " Please enter your name in here: ",
                " This is a PDF document with a form. ",
            ],
        )

    @override_settings(OCR_MODE=" redo ")
    def test_with_form_error(self):
        """With this OCR_MODE the form PDF has no archive but still has text."""
        parser = RasterisedDocumentParser(None)
        parser.parse(self._sample_path(" with-form.pdf "), " application/pdf ")

        self.assertIsNone(parser.archive_path)
        self.assertContainsStrings(
            parser.get_text(),
            [
                " Please enter your name in here: ",
                " This is a PDF document with a form. ",
            ],
        )

    @override_settings(OCR_MODE=" skip ")
    def test_encrypted(self):
        """The encrypted sample has no archive but still has text."""
        parser = RasterisedDocumentParser(None)
        parser.parse(self._sample_path(" encrypted.pdf "), " application/pdf ")

        self.assertIsNone(parser.archive_path)
        self.assertContainsStrings(
            parser.get_text(),
            [
                " This is a digitally signed PDF, created with Acrobat Pro for the Paperless project to enable ",
                " automated testing of signed/encrypted PDFs ",
            ],
        )

    @override_settings(OCR_MODE=" redo ")
    def test_with_form_error_notext(self):
        """Text of the form PDF is available; the archive is not checked."""
        parser = RasterisedDocumentParser(None)
        parser.parse(self._sample_path(" with-form.pdf "), " application/pdf ")

        self.assertContainsStrings(
            parser.get_text(),
            [
                " Please enter your name in here: ",
                " This is a PDF document with a form. ",
            ],
        )

    @override_settings(OCR_MODE=" force ")
    def test_with_form_force(self):
        """Text of the form PDF is available with this OCR_MODE."""
        parser = RasterisedDocumentParser(None)
        parser.parse(self._sample_path(" with-form.pdf "), " application/pdf ")

        self.assertContainsStrings(
            parser.get_text(),
            [
                " Please enter your name in here: ",
                " This is a PDF document with a form. ",
            ],
        )

    def test_image_simple(self):
        """A PNG image yields an archive file and its text."""
        parser = RasterisedDocumentParser(None)
        parser.parse(self._sample_path(" simple.png "), " image/png ")

        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(
            parser.get_text(), [" This is a test document. "]
        )

    def test_image_simple_alpha_fail(self):
        """Parsing the alpha-channel PNG sample raises ``ParseError``."""
        parser = RasterisedDocumentParser(None)

        with self.assertRaises(ParseError):
            parser.parse(self._sample_path(" simple-alpha.png "), " image/png ")

    def test_image_calc_a4_dpi(self):
        """``calculate_a4_dpi`` returns 62 for the no-DPI sample."""
        parser = RasterisedDocumentParser(None)

        dpi = parser.calculate_a4_dpi(self._sample_path(" simple-no-dpi.png "))

        self.assertEqual(dpi, 62)

    @mock.patch(
        " paperless_tesseract.parsers.RasterisedDocumentParser.calculate_a4_dpi "
    )
    def test_image_dpi_fail(self, m):
        """Parsing raises ``ParseError`` when ``calculate_a4_dpi`` returns None."""
        m.return_value = None
        parser = RasterisedDocumentParser(None)

        with self.assertRaises(ParseError):
            parser.parse(self._sample_path(" simple-no-dpi.png "), " image/png ")

    @override_settings(OCR_IMAGE_DPI=72)
    def test_image_no_dpi_default(self):
        """With OCR_IMAGE_DPI set, the no-DPI image is parsed and archived."""
        parser = RasterisedDocumentParser(None)
        parser.parse(self._sample_path(" simple-no-dpi.png "), " image/png ")

        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(
            parser.get_text().lower(), [" this is a test document. "]
        )

    def test_multi_page(self):
        """All three pages of the digital multi-page PDF are in the text."""
        parser = RasterisedDocumentParser(None)
        parser.parse(
            self._sample_path(" multi-page-digital.pdf "), " application/pdf "
        )

        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(
            parser.get_text().lower(), [" page 1 ", " page 2 ", " page 3 "]
        )

    @override_settings(OCR_PAGES=2, OCR_MODE=" skip ")
    def test_multi_page_pages_skip(self):
        """The digital PDF keeps all three pages of text with OCR_PAGES=2."""
        parser = RasterisedDocumentParser(None)
        parser.parse(
            self._sample_path(" multi-page-digital.pdf "), " application/pdf "
        )

        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(
            parser.get_text().lower(), [" page 1 ", " page 2 ", " page 3 "]
        )

    @override_settings(OCR_PAGES=2, OCR_MODE=" redo ")
    def test_multi_page_pages_redo(self):
        """The digital PDF keeps all three pages of text with OCR_PAGES=2."""
        parser = RasterisedDocumentParser(None)
        parser.parse(
            self._sample_path(" multi-page-digital.pdf "), " application/pdf "
        )

        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(
            parser.get_text().lower(), [" page 1 ", " page 2 ", " page 3 "]
        )

    @override_settings(OCR_PAGES=2, OCR_MODE=" force ")
    def test_multi_page_pages_force(self):
        """The digital PDF keeps all three pages of text with OCR_PAGES=2."""
        parser = RasterisedDocumentParser(None)
        parser.parse(
            self._sample_path(" multi-page-digital.pdf "), " application/pdf "
        )

        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(
            parser.get_text().lower(), [" page 1 ", " page 2 ", " page 3 "]
        )

    # The setting name was previously misspelled as OOCR_MODE, so the override
    # never reached the setting it was meant to change.
    @override_settings(OCR_MODE=" skip ")
    def test_multi_page_analog_pages_skip(self):
        """All three pages of the image-only PDF are in the text."""
        parser = RasterisedDocumentParser(None)
        parser.parse(
            self._sample_path(" multi-page-images.pdf "), " application/pdf "
        )

        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(
            parser.get_text().lower(), [" page 1 ", " page 2 ", " page 3 "]
        )

    @override_settings(OCR_PAGES=2, OCR_MODE=" redo ")
    def test_multi_page_analog_pages_redo(self):
        """With OCR_PAGES=2 only the first two image pages produce text."""
        parser = RasterisedDocumentParser(None)
        parser.parse(
            self._sample_path(" multi-page-images.pdf "), " application/pdf "
        )

        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(
            parser.get_text().lower(), [" page 1 ", " page 2 "]
        )
        self.assertNotIn(" page 3 ", parser.get_text().lower())

    @override_settings(OCR_PAGES=1, OCR_MODE=" force ")
    def test_multi_page_analog_pages_force(self):
        """With OCR_PAGES=1 only the first image page produces text."""
        parser = RasterisedDocumentParser(None)
        parser.parse(
            self._sample_path(" multi-page-images.pdf "), " application/pdf "
        )

        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(parser.get_text().lower(), [" page 1 "])
        self.assertNotIn(" page 2 ", parser.get_text().lower())
        self.assertNotIn(" page 3 ", parser.get_text().lower())

    @override_settings(OCR_MODE=" skip_noarchive ")
    def test_skip_noarchive_withtext(self):
        """A PDF that already has text gets no archive in this mode."""
        parser = RasterisedDocumentParser(None)
        parser.parse(
            self._sample_path(" multi-page-digital.pdf "), " application/pdf "
        )

        self.assertIsNone(parser.archive_path)
        self.assertContainsStrings(
            parser.get_text().lower(), [" page 1 ", " page 2 ", " page 3 "]
        )

    @override_settings(OCR_MODE=" skip_noarchive ")
    def test_skip_noarchive_notext(self):
        """An image-only PDF still gets an archive in this mode."""
        parser = RasterisedDocumentParser(None)
        parser.parse(
            self._sample_path(" multi-page-images.pdf "), " application/pdf "
        )

        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(
            parser.get_text().lower(), [" page 1 ", " page 2 ", " page 3 "]
        )

    @override_settings(OCR_MODE=" skip ")
    def test_multi_page_mixed(self):
        """The mixed PDF yields all six pages and a sidecar noting skipped pages."""
        parser = RasterisedDocumentParser(None)
        parser.parse(
            self._sample_path(" multi-page-mixed.pdf "), " application/pdf "
        )

        self.assertTrue(os.path.isfile(parser.archive_path))
        self.assertContainsStrings(
            parser.get_text().lower(),
            [" page 1 ", " page 2 ", " page 3 ", " page 4 ", " page 5 ", " page 6 "],
        )

        sidecar_path = os.path.join(parser.tempdir, " sidecar.txt ")
        with open(sidecar_path) as f:
            sidecar = f.read()

        self.assertIn(" [OCR skipped on page(s) 4-6] ", sidecar)

    @override_settings(OCR_MODE=" skip_noarchive ")
    def test_multi_page_mixed_no_archive(self):
        """The mixed PDF gets no archive; pages 4-6 are present in the text."""
        parser = RasterisedDocumentParser(None)
        parser.parse(
            self._sample_path(" multi-page-mixed.pdf "), " application/pdf "
        )

        self.assertIsNone(parser.archive_path)
        self.assertContainsStrings(
            parser.get_text().lower(), [" page 4 ", " page 5 ", " page 6 "]
        )

    @override_settings(OCR_MODE=" skip ", OCR_ROTATE_PAGES=True)
    def test_rotate(self):
        """Text of the rotated sample is extracted in page order."""
        parser = RasterisedDocumentParser(None)
        parser.parse(self._sample_path(" rotated.pdf "), " application/pdf ")

        self.assertContainsStrings(
            parser.get_text(),
            [
                " This is the text that appears on the first page. It’ s a lot of text. ",
                " Even if the pages are rotated, OCRmyPDF still gets the job done. ",
                " This is a really weird file with lots of nonsense text. ",
                " If you read this, it’ s your own fault. Also check your screen orientation. ",
            ],
        )

    def test_ocrmypdf_parameters(self):
        """``construct_ocrmypdf_parameters`` reflects file names and OCR settings."""
        parser = RasterisedDocumentParser(None)

        params = parser.construct_ocrmypdf_parameters(
            input_file=" input.pdf ",
            output_file=" output.pdf ",
            sidecar_file=" sidecar.txt ",
            mime_type=" application/pdf ",
            safe_fallback=False,
        )

        self.assertEqual(params[" input_file "], " input.pdf ")
        self.assertEqual(params[" output_file "], " output.pdf ")
        self.assertEqual(params[" sidecar "], " sidecar.txt ")

        # OCR_CLEAN variants.
        with override_settings(OCR_CLEAN=" none "):
            params = parser.construct_ocrmypdf_parameters(" ", " ", " ", " ")
            self.assertNotIn(" clean ", params)
            self.assertNotIn(" clean_final ", params)

        with override_settings(OCR_CLEAN=" clean "):
            params = parser.construct_ocrmypdf_parameters(" ", " ", " ", " ")
            self.assertTrue(params[" clean "])
            self.assertNotIn(" clean_final ", params)

        with override_settings(OCR_CLEAN=" clean-final ", OCR_MODE=" skip "):
            params = parser.construct_ocrmypdf_parameters(" ", " ", " ", " ")
            self.assertTrue(params[" clean_final "])
            self.assertNotIn(" clean ", params)

        with override_settings(OCR_CLEAN=" clean-final ", OCR_MODE=" redo "):
            params = parser.construct_ocrmypdf_parameters(" ", " ", " ", " ")
            self.assertTrue(params[" clean "])
            self.assertNotIn(" clean_final ", params)

        # OCR_DESKEW variants.
        with override_settings(OCR_DESKEW=True, OCR_MODE=" skip "):
            params = parser.construct_ocrmypdf_parameters(" ", " ", " ", " ")
            self.assertTrue(params[" deskew "])

        with override_settings(OCR_DESKEW=True, OCR_MODE=" redo "):
            params = parser.construct_ocrmypdf_parameters(" ", " ", " ", " ")
            self.assertNotIn(" deskew ", params)

        with override_settings(OCR_DESKEW=False, OCR_MODE=" skip "):
            params = parser.construct_ocrmypdf_parameters(" ", " ", " ", " ")
            self.assertNotIn(" deskew ", params)
2020-12-02 17:39:49 +01:00
class TestParserFileTypes(DirectoriesMixin, TestCase):
    """Checks that each supported image format is parsed and archived."""

    # NOTE(review): string literals are reproduced exactly as found, including
    # their leading/trailing spaces -- confirm against the repository.

    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), " samples ")

    def _parse_and_check(self, file_name, mime_type):
        """Parse a sample file, then assert an archive exists and the text matches."""
        sample = os.path.join(self.SAMPLE_FILES, file_name)
        parser = RasterisedDocumentParser(None)
        parser.parse(sample, mime_type)

        archive_exists = os.path.isfile(parser.archive_path)
        self.assertTrue(archive_exists)

        lowered_text = parser.get_text().lower()
        self.assertTrue(" this is a test document " in lowered_text)

    def test_bmp(self):
        """BMP sample."""
        self._parse_and_check(" simple.bmp ", " image/bmp ")

    def test_jpg(self):
        """JPEG sample."""
        self._parse_and_check(" simple.jpg ", " image/jpeg ")

    @override_settings(OCR_IMAGE_DPI=200)
    def test_gif(self):
        """GIF sample, parsed with OCR_IMAGE_DPI overridden to 200."""
        self._parse_and_check(" simple.gif ", " image/gif ")

    def test_tiff(self):
        """TIFF sample."""
        self._parse_and_check(" simple.tif ", " image/tiff ")