# paperless-ngx/src/documents/consumer.py

import datetime
import hashlib
import logging
import os
import re
import uuid

from django.conf import settings
from django.db import transaction
from django.utils import timezone

from paperless.db import GnuPG

from .classifier import DocumentClassifier
from .models import Document, FileInfo, Tag
from .parsers import ParseError
from .signals import (
    document_consumer_declaration,
    document_consumption_finished,
    document_consumption_started
)


class ConsumerError(Exception):
    pass


class Consumer:
    """
    Loop over every file found in CONSUMPTION_DIR and:
      1. Convert it to a greyscale pnm
      2. Use tesseract on the pnm
      3. Store the document in the MEDIA_ROOT with optional encryption
      4. Store the OCR'd text in the database
      5. Delete the document and image(s)
    """

    def __init__(self, consume=settings.CONSUMPTION_DIR,
                 scratch=settings.SCRATCH_DIR):

        self.logger = logging.getLogger(__name__)
        self.logging_group = None

        self.consume = consume
        self.scratch = scratch

        self.classifier = DocumentClassifier()

        os.makedirs(self.scratch, exist_ok=True)

        self.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
        if settings.PASSPHRASE:
            self.storage_type = Document.STORAGE_TYPE_GPG

        if not self.consume:
            raise ConsumerError(
                "The CONSUMPTION_DIR settings variable does not appear to be "
                "set."
            )

        if not os.path.exists(self.consume):
            raise ConsumerError(
                "Consumption directory {} does not exist".format(self.consume))

        self.parsers = []
        for response in document_consumer_declaration.send(self):
            self.parsers.append(response[1])

        if not self.parsers:
            raise ConsumerError(
                "No parsers could be found, not even the default. "
                "This is a problem."
            )

    def log(self, level, message):
        getattr(self.logger, level)(message, extra={
            "group": self.logging_group
        })
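
    # The whole consumption attempt runs inside one database transaction, so
    # a half-created Document record is rolled back if an unexpected error
    # escapes the method.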
    @transaction.atomic
    def try_consume_file(self, file):
        """
        Return True if file was consumed
        """

        if not re.match(FileInfo.REGEXES["title"], file):
            return False

        doc = file

        if self._is_duplicate(doc):
            self.log(
                "info",
                "Skipping {} as it appears to be a duplicate".format(doc)
            )
            return False

        parser_class = self._get_parser_class(doc)
        if not parser_class:
            self.log(
                "error", "No parsers could be found for {}".format(doc))
            return False

        self.logging_group = uuid.uuid4()

        self.log("info", "Consuming {}".format(doc))

        document_consumption_started.send(
            sender=self.__class__,
            filename=doc,
            logging_group=self.logging_group
        )

        parsed_document = parser_class(doc)

        try:
            thumbnail = parsed_document.get_optimised_thumbnail()
            date = parsed_document.get_date()
            document = self._store(
                parsed_document.get_text(),
                doc,
                thumbnail,
                date
            )
        except ParseError as e:
            self.log("error", "PARSE FAILURE for {}: {}".format(doc, e))
            parsed_document.cleanup()
            return False
        else:
            parsed_document.cleanup()
            self._cleanup_doc(doc)

            self.log(
                "info",
                "Document {} consumption finished".format(document)
            )

            classifier = None

            try:
                self.classifier.reload()
                classifier = self.classifier
            except FileNotFoundError:
                logging.getLogger(__name__).warning(
                    "Cannot classify documents, classifier model file was "
                    "not found."
                )

            document_consumption_finished.send(
                sender=self.__class__,
                document=document,
                logging_group=self.logging_group,
                classifier=classifier
            )
            return True
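
    # Parser selection: each handler registered on document_consumer_declaration
    # is called with the file path and returns a dict carrying a "parser" class
    # and a numeric "weight" (or a falsy value if it cannot handle the file);
    # the candidate with the highest weight wins.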
    def _get_parser_class(self, doc):
        """
        Determine the appropriate parser class based on the file
        """

        options = []
        for parser in self.parsers:
            result = parser(doc)
            if result:
                options.append(result)

        self.log(
            "info",
            "Parsers available: {}".format(
                ", ".join([str(o["parser"].__name__) for o in options])
            )
        )

        if not options:
            return None

        # Return the parser with the highest weight.
        return sorted(
            options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
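
    # _store derives the "created" timestamp from, in order: the date encoded
    # in the file name, the date extracted by the parser, and finally the
    # file's modification time.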
    def _store(self, text, doc, thumbnail, date):

        file_info = FileInfo.from_path(doc)

        stats = os.stat(doc)

        self.log("debug", "Saving record to database")

        created = file_info.created or date or timezone.make_aware(
            datetime.datetime.fromtimestamp(stats.st_mtime))

        with open(doc, "rb") as f:
            document = Document.objects.create(
                correspondent=file_info.correspondent,
                title=file_info.title,
                content=text,
                file_type=file_info.extension,
                checksum=hashlib.md5(f.read()).hexdigest(),
                created=created,
                modified=created,
                storage_type=self.storage_type
            )

        relevant_tags = set(file_info.tags)
        if relevant_tags:
            tag_names = ", ".join([t.slug for t in relevant_tags])
            self.log("debug", "Tagging with {}".format(tag_names))
            document.tags.add(*relevant_tags)

        self._write(document, doc, document.source_path)
        self._write(document, thumbnail, document.thumbnail_path)

        document.save()

        self.log("info", "Completed")

        return document
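
    # _write copies the source file to its final location, either verbatim or
    # GPG-encrypted, depending on the document's storage_type.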
    def _write(self, document, source, target):
        with open(source, "rb") as read_file:
            with open(target, "wb") as write_file:
                if document.storage_type == Document.STORAGE_TYPE_UNENCRYPTED:
                    write_file.write(read_file.read())
                    return
                self.log("debug", "Encrypting")
                write_file.write(GnuPG.encrypted(read_file))

    def _cleanup_doc(self, doc):
        self.log("debug", "Deleting document {}".format(doc))
        os.unlink(doc)
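
    # Duplicate detection compares the MD5 checksum of the incoming file
    # against the checksums already stored on existing Document records.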
    @staticmethod
    def _is_duplicate(doc):
        with open(doc, "rb") as f:
            checksum = hashlib.md5(f.read()).hexdigest()
        return Document.objects.filter(checksum=checksum).exists()