paperless-ngx/src/documents/tasks.py

import logging
import os
import tempfile

import tqdm
from django.conf import settings
from django.db.models.signals import post_save
from pdf2image import convert_from_path
from pikepdf import Pdf
from pyzbar import pyzbar
from whoosh.writing import AsyncWriter

from documents import index
from documents import sanity_checker
from documents.classifier import DocumentClassifier
from documents.classifier import load_classifier
from documents.consumer import Consumer
from documents.consumer import ConsumerError
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
from documents.models import Tag
from documents.sanity_checker import SanityCheckFailedException

logger = logging.getLogger("paperless.tasks")


def index_optimize():
    ix = index.open_index()
    writer = AsyncWriter(ix)
    writer.commit(optimize=True)


def index_reindex(progress_bar_disable=False):
    documents = Document.objects.all()
    ix = index.open_index(recreate=True)

    with AsyncWriter(ix) as writer:
        for document in tqdm.tqdm(documents, disable=progress_bar_disable):
            index.update_document(writer, document)


def train_classifier():
    if (
        not Tag.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
        and not DocumentType.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
        and not Correspondent.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
    ):
        return

    classifier = load_classifier()

    if not classifier:
        classifier = DocumentClassifier()

    try:
        if classifier.train():
            logger.info(
                "Saving updated classifier model to {}...".format(settings.MODEL_FILE),
            )
            classifier.save()
        else:
            logger.debug("Training data unchanged.")
    except Exception as e:
        logger.warning("Classifier error: " + str(e))


def barcode_reader(page) -> list:
    """
    Read any barcodes contained in page.
    Returns a list containing all found barcodes.
    """
    barcodes = []
    # Decode the barcode image
    detected_barcodes = pyzbar.decode(page)

    if not detected_barcodes:
        logger.debug("No barcode detected")
    else:
        # Traverse through all the detected barcodes in the image
        for barcode in detected_barcodes:
            # barcode.data is a bytes object, so decode it to a string
            if barcode.data:
                decoded_barcode = barcode.data.decode("utf-8")
                barcodes.append(decoded_barcode)
                logger.debug(
                    f"Barcode of type {str(barcode.type)} found: {decoded_barcode}",
                )
    return barcodes
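
# Illustrative usage sketch (not executed by this module): barcode_reader()
# accepts anything pyzbar.decode() understands, such as a PIL image. The
# file name below is hypothetical:
#
#     from PIL import Image
#     codes = barcode_reader(Image.open("scanned_page.png"))
#     # e.g. ["PATCHT"] if the page is a separator sheet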


def scan_file_for_separating_barcodes(filepath) -> list:
    """
    Scan the provided file for page-separating barcodes.
    Returns a list of page numbers which separate the file.
    """
    separator_page_numbers = []
    # use a temporary directory in case the file is too big to handle in memory
    with tempfile.TemporaryDirectory() as path:
        pages_from_path = convert_from_path(filepath, output_folder=path)
        for current_page_number, page in enumerate(pages_from_path):
            current_barcodes = barcode_reader(page)
            if "PATCHT" in current_barcodes:
                separator_page_numbers.append(current_page_number)
    return separator_page_numbers
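
# Illustrative sketch: for a five-page PDF whose third sheet carries a
# "PATCHT" barcode, the scan would return [2], since page numbers are
# zero-based. The path below is hypothetical:
#
#     separators = scan_file_for_separating_barcodes("/tmp/batch.pdf")  # -> [2]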


def separate_pages(filepath, pages_to_split_on: list):
    """
    Separate the provided file on the pages_to_split_on.
    The pages which are defined by page_numbers will be removed.
    """
    fname = os.path.splitext(os.path.basename(filepath))[0]
    # TODO: Get the directory of the file and save the other files there
    # TODO: Return list of new paths of the new files
    with Pdf.open(filepath) as pdf:
        for count, page_number in enumerate(pages_to_split_on):
            dst = Pdf.new()
            if count == 0:
                # First element, so iterate from zero to the first separator page
                start = 0
            else:
                # Start after the previous separator page, which is dropped
                start = pages_to_split_on[count - 1] + 1
            for page in range(start, page_number):
                dst.pages.append(pdf.pages[page])
            output_filename = "{}_page_{}.pdf".format(fname, count)
            with open(output_filename, "wb") as out:
                dst.save(out)
        # Pages after the last separator page form the final document
        if pages_to_split_on:
            dst = Pdf.new()
            for page in range(pages_to_split_on[-1] + 1, len(pdf.pages)):
                dst.pages.append(pdf.pages[page])
            output_filename = "{}_page_{}.pdf".format(fname, len(pages_to_split_on))
            with open(output_filename, "wb") as out:
                dst.save(out)
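
# Illustrative sketch combining the two helpers above: scan a batch for
# separator sheets and split it into individual documents. The path is
# hypothetical:
#
#     separators = scan_file_for_separating_barcodes("/tmp/batch.pdf")
#     if separators:
#         separate_pages("/tmp/batch.pdf", separators)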


def consume_file(
    path,
    override_filename=None,
    override_title=None,
    override_correspondent_id=None,
    override_document_type_id=None,
    override_tag_ids=None,
    task_id=None,
):
    # check for separators in the current document
    separator_page_numbers = scan_file_for_separating_barcodes(path)
    if separator_page_numbers:
        logger.debug(f"Pages with separators found: {str(separator_page_numbers)}")

    document = Consumer().try_consume_file(
        path,
        override_filename=override_filename,
        override_title=override_title,
        override_correspondent_id=override_correspondent_id,
        override_document_type_id=override_document_type_id,
        override_tag_ids=override_tag_ids,
        task_id=task_id,
    )

    if document:
        return "Success. New document id {} created".format(document.pk)
    else:
        raise ConsumerError(
            "Unknown error: Returned document was null, but "
            "no error message was given.",
        )
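
# Illustrative sketch: consume_file() is normally not called directly but
# dispatched to the task queue (assuming the django-q setup paperless-ngx
# used at this point), e.g.:
#
#     from django_q.tasks import async_task
#     async_task("documents.tasks.consume_file", "/path/to/incoming.pdf")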


def sanity_check():
    messages = sanity_checker.check_sanity()

    messages.log_messages()

    if messages.has_error():
        raise SanityCheckFailedException("Sanity check failed with errors. See log.")
    elif messages.has_warning():
        return "Sanity check exited with warnings. See log."
    elif len(messages) > 0:
        return "Sanity check exited with infos. See log."
    else:
        return "No issues detected."


def bulk_update_documents(document_ids):
    documents = Document.objects.filter(id__in=document_ids)
    ix = index.open_index()

    for doc in documents:
        post_save.send(Document, instance=doc, created=False)

    with AsyncWriter(ix) as writer:
        for doc in documents:
            index.update_document(writer, doc)