paperless-ngx/src/documents/tasks.py

120 lines
3.3 KiB
Python
Raw Normal View History

import logging
import tqdm
from django.conf import settings
2020-12-11 14:27:54 +01:00
from django.db.models.signals import post_save
from whoosh.writing import AsyncWriter
2020-11-25 16:04:58 +01:00
from documents import index, sanity_checker
from documents.classifier import DocumentClassifier, load_classifier
2020-11-16 18:26:54 +01:00
from documents.consumer import Consumer, ConsumerError
from documents.models import Document, Tag, DocumentType, Correspondent
from documents.sanity_checker import SanityCheckFailedException
2021-02-05 01:10:29 +01:00
# Module-level logger registered under the "paperless.tasks" name.
logger = logging.getLogger("paperless.tasks")
def index_optimize():
    """Run an optimizing commit on the search index.

    Opens the index through ``index.open_index()``, wraps it in an
    ``AsyncWriter`` and immediately commits with ``optimize=True``. No
    documents are added or changed by this function itself.
    """
    AsyncWriter(index.open_index()).commit(optimize=True)
def index_reindex(progress_bar_disable=False):
    """Recreate the search index and write every document into it.

    All ``Document`` objects are fetched, the index is opened with
    ``recreate=True`` and each document is passed to
    ``index.update_document`` using a single ``AsyncWriter``.

    :param progress_bar_disable: forwarded to ``tqdm`` as ``disable``;
        pass ``True`` to suppress the progress bar.
    """
    all_documents = Document.objects.all()
    fresh_index = index.open_index(recreate=True)

    with AsyncWriter(fresh_index) as index_writer:
        progress = tqdm.tqdm(all_documents, disable=progress_bar_disable)
        for doc in progress:
            index.update_document(index_writer, doc)
def train_classifier():
    """Train the document classifier and save it if training succeeded.

    Returns immediately unless at least one ``Tag``, ``DocumentType`` or
    ``Correspondent`` has ``matching_algorithm`` set to ``Tag.MATCH_AUTO``.

    The classifier returned by ``load_classifier()`` is reused; if that
    result is falsy, a new ``DocumentClassifier`` is created instead. The
    model is saved only when ``classifier.train()`` returns a truthy value.

    Any exception raised while training or saving is logged as a warning
    and is not re-raised.
    """
    # Checked lazily and in this order, so later models are only queried
    # when the earlier ones have no auto-matching entries.
    auto_matching_in_use = any(
        model.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
        for model in (Tag, DocumentType, Correspondent)
    )
    if not auto_matching_in_use:
        return

    classifier = load_classifier() or DocumentClassifier()

    try:
        model_changed = classifier.train()
        if model_changed:
            logger.info(
                "Saving updated classifier model to {}...".format(
                    settings.MODEL_FILE)
            )
            classifier.save()
        else:
            logger.debug("Training data unchanged.")
    except Exception as e:
        # Best effort: a failing classifier is reported but does not fail
        # the task.
        logger.warning("Classifier error: " + str(e))
2020-11-16 18:26:54 +01:00
2020-11-17 11:49:44 +01:00
def consume_file(path,
                 override_filename=None,
                 override_title=None,
                 override_correspondent_id=None,
                 override_document_type_id=None,
                 override_tag_ids=None,
                 task_id=None):
    """Consume the file at ``path`` and report the id of the new document.

    ``path`` is passed positionally and every other argument is forwarded
    unchanged, by keyword, to ``Consumer().try_consume_file``.

    :return: a success message containing the primary key of the created
        document.
    :raises ConsumerError: if ``try_consume_file`` returns a falsy value
        without raising an error of its own.
    """
    consumer_options = {
        "override_filename": override_filename,
        "override_title": override_title,
        "override_correspondent_id": override_correspondent_id,
        "override_document_type_id": override_document_type_id,
        "override_tag_ids": override_tag_ids,
        "task_id": task_id,
    }
    new_document = Consumer().try_consume_file(path, **consumer_options)

    if not new_document:
        raise ConsumerError("Unknown error: Returned document was null, but "
                            "no error message was given.")

    return "Success. New document id {} created".format(new_document.pk)
2020-11-25 16:04:58 +01:00
def sanity_check():
    """Run the sanity checker, log its messages and summarise the outcome.

    :return: a short status string saying whether warnings, infos or no
        issues at all were found.
    :raises SanityCheckFailedException: if the collected messages contain
        an error.
    """
    report = sanity_checker.check_sanity()
    report.log_messages()

    # Most severe outcome first: errors, then warnings, then anything else.
    if report.has_error():
        raise SanityCheckFailedException(
            "Sanity check failed with errors. See log.")
    if report.has_warning():
        return "Sanity check exited with warnings. See log."
    if len(report) > 0:
        return "Sanity check exited with infos. See log."
    return "No issues detected."
2020-12-11 14:27:54 +01:00
def bulk_update_documents(document_ids):
    """Send ``post_save`` for the given documents and update the index.

    For every ``Document`` whose id is in ``document_ids``, a ``post_save``
    signal is sent with ``created=False``. After all signals have been
    sent, each of those documents is passed to ``index.update_document``
    using a single ``AsyncWriter``.

    :param document_ids: ids used in the ``id__in`` filter on ``Document``.
    """
    selected_documents = Document.objects.filter(id__in=document_ids)
    search_index = index.open_index()

    # First pass: send the post_save signal for each document.
    for document in selected_documents:
        post_save.send(Document, instance=document, created=False)

    # Second pass: update the search index entries with a single writer.
    with AsyncWriter(search_index) as index_writer:
        for document in selected_documents:
            index.update_document(index_writer, document)