mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-12-16 03:26:50 +01:00
use imagehash instead of bitwise hashing
This commit is contained in:
parent
ebe21a0114
commit
1fa735eb23
4 changed files with 144 additions and 113 deletions
|
|
@ -2,7 +2,6 @@ import datetime
|
|||
import os
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
from django.test import TestCase
|
||||
from documents.parsers import ParseError
|
||||
from paperless_mail.parsers import MailDocumentParser
|
||||
|
|
|
|||
|
|
@ -1,4 +1,3 @@
|
|||
import hashlib
|
||||
import os
|
||||
from unittest import mock
|
||||
from urllib.error import HTTPError
|
||||
|
|
@ -8,8 +7,10 @@ import pytest
|
|||
from django.test import TestCase
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import run_convert
|
||||
from imagehash import average_hash
|
||||
from paperless_mail.parsers import MailDocumentParser
|
||||
from pdfminer.high_level import extract_text
|
||||
from PIL import Image
|
||||
|
||||
|
||||
class TestParserLive(TestCase):
|
||||
|
|
@ -22,16 +23,8 @@ class TestParserLive(TestCase):
|
|||
self.parser.cleanup()
|
||||
|
||||
@staticmethod
|
||||
def hashfile(file):
|
||||
buf_size = 65536 # An arbitrary (but fixed) buffer
|
||||
sha256 = hashlib.sha256()
|
||||
with open(file, "rb") as f:
|
||||
while True:
|
||||
data = f.read(buf_size)
|
||||
if not data:
|
||||
break
|
||||
sha256.update(data)
|
||||
return sha256.hexdigest()
|
||||
def imagehash(file, hash_size=18):
|
||||
return f"{average_hash(Image.open(file), hash_size)}"
|
||||
|
||||
# Only run if convert is available
|
||||
@pytest.mark.skipif(
|
||||
|
|
@ -53,8 +46,8 @@ class TestParserLive(TestCase):
|
|||
expected = os.path.join(self.SAMPLE_FILES, "simple_text.eml.pdf.webp")
|
||||
|
||||
self.assertEqual(
|
||||
self.hashfile(thumb),
|
||||
self.hashfile(expected),
|
||||
self.imagehash(thumb),
|
||||
self.imagehash(expected),
|
||||
f"Created Thumbnail {thumb} differs from expected file {expected}",
|
||||
)
|
||||
|
||||
|
|
@ -158,10 +151,10 @@ class TestParserLive(TestCase):
|
|||
logging_group=None,
|
||||
)
|
||||
self.assertTrue(os.path.isfile(converted))
|
||||
thumb_hash = self.hashfile(converted)
|
||||
thumb_hash = self.imagehash(converted)
|
||||
|
||||
# The created pdf is not reproducible. But the converted image should always look the same.
|
||||
expected_hash = self.hashfile(
|
||||
expected_hash = self.imagehash(
|
||||
os.path.join(self.SAMPLE_FILES, "html.eml.pdf.webp"),
|
||||
)
|
||||
self.assertEqual(
|
||||
|
|
@ -244,10 +237,10 @@ class TestParserLive(TestCase):
|
|||
logging_group=None,
|
||||
)
|
||||
self.assertTrue(os.path.isfile(converted))
|
||||
thumb_hash = self.hashfile(converted)
|
||||
thumb_hash = self.imagehash(converted)
|
||||
|
||||
# The created pdf is not reproducible. But the converted image should always look the same.
|
||||
expected_hash = self.hashfile(
|
||||
expected_hash = self.imagehash(
|
||||
os.path.join(self.SAMPLE_FILES, "sample.html.pdf.webp"),
|
||||
)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue