import hashlib
import logging
import os
import shutil
import uuid
from pathlib import Path
from typing import Type

import dateutil.parser
import tqdm
from asgiref.sync import async_to_sync
from celery import shared_task
from channels.layers import get_channel_layer
from django.conf import settings
from django.db import transaction
from django.db.models.signals import post_save
from documents import barcodes
from documents import index
from documents import sanity_checker
from documents.classifier import DocumentClassifier
from documents.classifier import load_classifier
from documents.consumer import Consumer
from documents.consumer import ConsumerError
from documents.file_handling import create_source_path_directory
from documents.file_handling import generate_unique_filename
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
from documents.models import StoragePath
from documents.models import Tag
from documents.parsers import DocumentParser
from documents.parsers import get_parser_class_for_mime_type
from documents.sanity_checker import SanityCheckFailedException
from filelock import FileLock
from redis.exceptions import ConnectionError
from whoosh.writing import AsyncWriter

logger = logging.getLogger("paperless.tasks")


@shared_task
def index_optimize():
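    """Optimize the Whoosh search index by merging its segments."""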
    ix = index.open_index()
    writer = AsyncWriter(ix)
    writer.commit(optimize=True)


def index_reindex(progress_bar_disable=False):
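    """Recreate the search index from scratch and add every document to it."""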
    documents = Document.objects.all()

    ix = index.open_index(recreate=True)

    with AsyncWriter(ix) as writer:
        for document in tqdm.tqdm(documents, disable=progress_bar_disable):
            index.update_document(writer, document)


@shared_task
def train_classifier():
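    """Train and save the document classifier, unless nothing uses automatic matching."""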
    if (
        not Tag.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
        and not DocumentType.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
        and not Correspondent.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
        and not StoragePath.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
    ):
        return

    classifier = load_classifier()

    if not classifier:
        classifier = DocumentClassifier()

    try:
        if classifier.train():
            logger.info(
                f"Saving updated classifier model to {settings.MODEL_FILE}...",
            )
            classifier.save()
        else:
            logger.debug("Training data unchanged.")
    except Exception as e:
        logger.warning("Classifier error: " + str(e))


@shared_task
def consume_file(
    path,
    override_filename=None,
    override_title=None,
    override_correspondent_id=None,
    override_document_type_id=None,
    override_tag_ids=None,
    task_id=None,
    override_created=None,
):
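    """
    Consume a single file: optionally split it on separator barcodes,
    read an ASN barcode if enabled, then hand the file to the Consumer
    together with the given metadata overrides.
    """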
    path = Path(path).resolve()

    # Celery converts this to a string, but everything expects a datetime
    # Long term solution is to not use JSON for the serializer but pickle instead
    # TODO: This will be resolved in kombu 5.3, expected with celery 5.3
    # More types will be retained through JSON encode/decode
    if override_created is not None and isinstance(override_created, str):
        try:
            override_created = dateutil.parser.isoparse(override_created)
        except Exception:
            pass

    # check for separators in current document
    if settings.CONSUMER_ENABLE_BARCODES:
        pdf_filepath, separators = barcodes.scan_file_for_separating_barcodes(path)

        if separators:
            logger.debug(
                f"Pages with separators found in: {str(path)}",
            )
            document_list = barcodes.separate_pages(pdf_filepath, separators)

            if document_list:
                for n, document in enumerate(document_list):
                    # save to consumption dir
                    # rename it to the original filename with number prefix
                    if override_filename:
                        newname = f"{str(n)}_" + override_filename
                    else:
                        newname = None

                    # If the file is an upload, it's in the scratch directory
                    # Move it to consume directory to be picked up
                    # Otherwise, use the current parent to keep possible tags
                    # from subdirectories
                    try:
                        # is_relative_to would be nicer, but new in 3.9
                        _ = path.relative_to(settings.SCRATCH_DIR)
                        save_to_dir = settings.CONSUMPTION_DIR
                    except ValueError:
                        save_to_dir = path.parent

                    barcodes.save_to_dir(
                        document,
                        newname=newname,
                        target_dir=save_to_dir,
                    )

                # Delete the PDF file which was split
                os.remove(pdf_filepath)

                # If the original was a TIFF, remove the original file as well
                if str(pdf_filepath) != str(path):
                    logger.debug(f"Deleting file {path}")
                    os.unlink(path)

                # notify the sender, otherwise the progress bar
                # in the UI stays stuck
                payload = {
                    "filename": override_filename,
                    "task_id": task_id,
                    "current_progress": 100,
                    "max_progress": 100,
                    "status": "SUCCESS",
                    "message": "finished",
                }
                try:
                    async_to_sync(get_channel_layer().group_send)(
                        "status_updates",
                        {"type": "status_update", "data": payload},
                    )
                except ConnectionError as e:
                    logger.warning(f"ConnectionError on status send: {str(e)}")

                # consuming stops here, since the original document with
                # the barcodes has been split and will be consumed separately
                return "File successfully split"

    # try reading ASN barcodes
    asn = None
    if settings.CONSUMER_ENABLE_ASN_BARCODE:
        _, asn = barcodes.scan_file_for_asn_barcode(path)
        if asn:
            logger.info(f"Using ASN {asn} from barcode")

    # continue with consumption if no barcode was found
    document = Consumer().try_consume_file(
        path,
        override_filename=override_filename,
        override_title=override_title,
        override_correspondent_id=override_correspondent_id,
        override_document_type_id=override_document_type_id,
        override_tag_ids=override_tag_ids,
        task_id=task_id,
        override_created=override_created,
        override_asn=asn,
    )

    if document:
        return f"Success. New document id {document.pk} created"
    else:
        raise ConsumerError(
            "Unknown error: Returned document was null, but "
            "no error message was given.",
        )


@shared_task
def sanity_check():
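    """Run the sanity checker, log its messages and report the outcome."""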
    messages = sanity_checker.check_sanity()

    messages.log_messages()

    if messages.has_error:
        raise SanityCheckFailedException("Sanity check failed with errors. See log.")
    elif messages.has_warning:
        return "Sanity check exited with warnings. See log."
    elif len(messages) > 0:
        return "Sanity check exited with infos. See log."
    else:
        return "No issues detected."


@shared_task
def bulk_update_documents(document_ids):
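    """Re-send post_save signals and refresh the search index for the given documents."""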
    documents = Document.objects.filter(id__in=document_ids)

    ix = index.open_index()

    for doc in documents:
        post_save.send(Document, instance=doc, created=False)

    with AsyncWriter(ix) as writer:
        for doc in documents:
            index.update_document(writer, doc)


@shared_task
def update_document_archive_file(document_id):
    """
    Re-creates the archive file of a document, including new OCR content and thumbnail
    """
    document = Document.objects.get(id=document_id)

    mime_type = document.mime_type

    parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(mime_type)

    if not parser_class:
        logger.error(
            f"No parser found for mime type {mime_type}, cannot "
            f"archive document {document} (ID: {document_id})",
        )
        return

    parser: DocumentParser = parser_class(logging_group=uuid.uuid4())
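
    # Re-parse the original file to produce fresh text content, an archive
    # version and a thumbnail, then swap them in under the media lock.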
    try:
        parser.parse(document.source_path, mime_type, document.get_public_filename())

        thumbnail = parser.get_thumbnail(
            document.source_path,
            mime_type,
            document.get_public_filename(),
        )

        if parser.get_archive_path():
            with transaction.atomic():
                with open(parser.get_archive_path(), "rb") as f:
                    checksum = hashlib.md5(f.read()).hexdigest()
                # I'm going to save first so that in case the file move
                # fails, the database is rolled back.
                # We also don't use save() since that triggers the filehandling
                # logic, and we don't want that yet (file not yet in place)
                document.archive_filename = generate_unique_filename(
                    document,
                    archive_filename=True,
                )
                Document.objects.filter(pk=document.pk).update(
                    archive_checksum=checksum,
                    content=parser.get_text(),
                    archive_filename=document.archive_filename,
                )
                with FileLock(settings.MEDIA_LOCK):
                    create_source_path_directory(document.archive_path)
                    shutil.move(parser.get_archive_path(), document.archive_path)
                    shutil.move(thumbnail, document.thumbnail_path)
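
        # Update the search index so queries see the new content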
        with index.open_index_writer() as writer:
            index.update_document(writer, document)

    except Exception:
        logger.exception(
            f"Error while parsing document {document} (ID: {document_id})",
        )
    finally:
        parser.cleanup()