paperless-ngx/src/documents/management/commands/document_consumer.py

268 lines
7.4 KiB
Python
Raw Normal View History

2015-12-26 13:21:33 +00:00
import datetime
2015-12-20 19:23:33 +00:00
import glob
2016-01-21 12:50:22 -05:00
import langdetect
2015-12-20 19:23:33 +00:00
import os
import random
import re
import subprocess
import time
2015-12-20 19:23:33 +00:00
import pyocr
from PIL import Image
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from django.template.defaultfilters import slugify
2015-12-26 13:21:33 +00:00
from django.utils import timezone
2015-12-20 19:23:33 +00:00
2016-01-23 02:58:03 +00:00
from paperless.db import GnuPG
2016-01-23 02:33:29 +00:00
from ...languages import ISO639
2016-01-28 07:23:11 +00:00
from ...models import Document, Sender, Tag
2015-12-20 19:23:33 +00:00
2016-01-23 02:33:29 +00:00
class OCRError(Exception):
    """
    Raised when a document cannot be OCR'd and should be skipped.

    Derives from ``Exception`` (not ``BaseException``) so that generic
    ``except Exception`` handlers treat it like any other application error;
    ``BaseException`` is reserved for interpreter-level exits such as
    ``KeyboardInterrupt`` and ``SystemExit``.
    """
2015-12-20 19:23:33 +00:00
class Command(BaseCommand):
    """
    Loop over every file found in CONSUMPTION_DIR and:
      1. Convert it to greyscale png(s)
      2. Run OCR on the png(s)
      3. Encrypt and store the PDF in the MEDIA_ROOT
      4. Store the OCR'd text in the database
      5. Delete the pdf and image(s)
    """

    LOOP_TIME = 10  # Seconds to sleep between passes over CONSUMPTION_DIR

    CONVERT = settings.CONVERT_BINARY
    SCRATCH = settings.SCRATCH_DIR
    CONSUME = settings.CONSUMPTION_DIR

    # NOTE: evaluated when the class is defined; raises IndexError if pyocr
    # reports no available tools.
    OCR = pyocr.get_available_tools()[0]
    DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE

    MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf")

    # Both patterns require a "/" in the path and a lower-case ".pdf" suffix.
    PARSER_REGEX_TITLE = re.compile(r"^.*/(.*)\.pdf$")
    PARSER_REGEX_SENDER_TITLE = re.compile(r"^.*/(.*) - (.*)\.pdf$")

    def __init__(self, *args, **kwargs):
        self.verbosity = 0
        # path -> mtime recorded on the previous pass (see _is_ready)
        self.stats = {}
        # Paths that failed OCR; skipped for the rest of this process
        self._ignore = set()
        super().__init__(*args, **kwargs)

    def handle(self, *args, **options):
        """
        Entry point: prepare the directories, then poll the consumption
        directory every LOOP_TIME seconds until interrupted with Ctrl-C.
        """
        self.verbosity = options["verbosity"]
        self._setup()
        try:
            while True:
                self.loop()
                time.sleep(self.LOOP_TIME)
                if self.verbosity > 1:
                    print(".")
        except KeyboardInterrupt:
            print("Exiting")

    def loop(self):
        """
        Make one pass over the consumption directory, consuming every PDF
        that is ready and has not previously failed.
        """
        for name in os.listdir(self.CONSUME):
            pdf = os.path.join(self.CONSUME, name)

            if not os.path.isfile(pdf):
                continue
            if not self.PARSER_REGEX_TITLE.match(pdf):
                continue
            if pdf in self._ignore:
                continue
            # Only consume once the file has stopped changing. The previous
            # code skipped files that *were* ready, so every file was
            # consumed the first time it was seen, possibly mid-write.
            if not self._is_ready(pdf):
                continue

            self._render("Consuming {}".format(pdf), 1)

            try:
                pngs = self._get_greyscale(pdf)
                text = self._get_ocr(pngs)
            except OCRError:
                self._ignore.add(pdf)
                self._render("OCR FAILURE: {}".format(pdf), 0)
                continue

            self._store(text, pdf)
            self._cleanup(pngs, pdf)

    def _setup(self):
        """
        Validate the consumption directory and create the scratch and media
        directories if they are missing.

        Raises CommandError if CONSUMPTION_DIR is unset or does not exist.
        """
        if not self.CONSUME:
            raise CommandError(
                "The CONSUMPTION_DIR settings variable does not appear to be "
                "set."
            )

        if not os.path.exists(self.CONSUME):
            raise CommandError("Consumption directory {} does not exist".format(
                self.CONSUME))

        for d in (self.SCRATCH, self.MEDIA_PDF):
            os.makedirs(d, exist_ok=True)

    def _is_ready(self, pdf):
        """
        Detect whether `pdf` is ready to consume or if it's still being written
        to by the scanner.

        A file is considered ready when its mtime is the same as the one
        recorded on the previous call for that path. The first call for a
        path therefore always returns False.
        """
        t = os.stat(pdf).st_mtime

        if self.stats.get(pdf) == t:
            del self.stats[pdf]
            return True

        self.stats[pdf] = t
        return False

    @staticmethod
    def _page_index(png):
        """
        Return the numeric "-N" page suffix of a scratch image path, or 0
        when the name has no such suffix (single-image output).
        """
        m = re.search(r"-(\d+)\.png$", png)
        return int(m.group(1)) if m else 0

    def _get_greyscale(self, pdf):
        """
        Convert `pdf` into greyscale png(s) in the scratch directory and
        return their paths in page order.

        Raises OCRError if the conversion produced no images.
        """
        self._render(" Generating greyscale image", 2)

        # Random 7-digit prefix keeps this run's images apart in SCRATCH
        i = random.randint(1000000, 9999999)
        png = os.path.join(self.SCRATCH, "{}.png".format(i))

        subprocess.call((
            self.CONVERT, "-density", "300", "-depth", "8",
            "-type", "grayscale", pdf, png
        ))

        # Sort numerically on the page suffix: a plain lexical sort would put
        # "-10" before "-2" for documents with more than ten pages.
        pngs = sorted(
            glob.glob(os.path.join(self.SCRATCH, "{}*".format(i))),
            key=self._page_index
        )
        if not pngs:
            raise OCRError("No images were generated from {}".format(pdf))
        return pngs

    def _get_ocr(self, pngs):
        """
        OCR `pngs` and return the text.

        The images are first parsed with DEFAULT_OCR_LANGUAGE, then the
        language of the result is guessed. If the guess maps to a different
        OCR language, the images are parsed again with that language.

        Raises OCRError if the guessed language is unknown or the second
        parse fails, unless settings.FORGIVING_OCR is enabled, in which case
        the text from the first parse is returned instead.
        """
        self._render(" OCRing the PDF", 2)

        raw_text = self._ocr(pngs, self.DEFAULT_OCR_LANGUAGE)

        guessed_language = langdetect.detect(raw_text)

        self._render(" Language detected: {}".format(guessed_language), 2)

        if guessed_language not in ISO639:
            self._render("Language detection failed!", 0)
            if settings.FORGIVING_OCR:
                self._render(
                    "As FORGIVING_OCR is enabled, we're going to make the best "
                    "with what we have.",
                    1
                )
                return raw_text
            raise OCRError

        if ISO639[guessed_language] == self.DEFAULT_OCR_LANGUAGE:
            # The first pass already used the right language
            return raw_text

        try:
            return self._ocr(pngs, ISO639[guessed_language])
        except pyocr.pyocr.tesseract.TesseractError:
            if settings.FORGIVING_OCR:
                self._render(
                    "OCR for {} failed, but we're going to stick with what "
                    "we've got since FORGIVING_OCR is enabled.".format(
                        guessed_language
                    ),
                    0
                )
                return raw_text
            raise OCRError

    def _ocr(self, pngs, lang):
        """
        Run the OCR tool over each image in `pngs` using `lang` and return
        the combined text with all whitespace runs collapsed to one space.
        """
        self._render(" Parsing for {}".format(lang), 2)

        pages = []
        for png in pngs:
            # os.path.join leaves `png` untouched when it is already absolute
            with Image.open(os.path.join(self.SCRATCH, png)) as f:
                self._render(" {}".format(f.filename), 3)
                pages.append(self.OCR.image_to_string(f, lang=lang))

        # Joining with a space keeps the last word of one page from running
        # into the first word of the next; the substitution below absorbs any
        # extra whitespace. Strip out excess white space to allow matching to
        # go smoother.
        return re.sub(r"\s+", " ", " ".join(pages))

    def _store(self, text, pdf):
        """
        Create the Document record for `pdf` with `text` as its content,
        attach every Tag that matches the text, and write an encrypted copy
        of the PDF to the document's pdf_path.
        """
        sender, title = self._parse_file_name(pdf)

        # Lower-case once rather than once per tag
        lowered = text.lower()
        relevant_tags = [t for t in Tag.objects.all() if t.matches(lowered)]

        stats = os.stat(pdf)

        self._render(" Saving record to database", 2)

        # The file's mtime is used for both created and modified
        file_time = timezone.make_aware(
            datetime.datetime.fromtimestamp(stats.st_mtime))

        doc = Document.objects.create(
            sender=sender,
            title=title,
            content=text,
            created=file_time,
            modified=file_time
        )

        if relevant_tags:
            tag_names = ", ".join([t.slug for t in relevant_tags])
            self._render(" Tagging with {}".format(tag_names), 2)
            doc.tags.add(*relevant_tags)

        with open(pdf, "rb") as unencrypted, \
                open(doc.pdf_path, "wb") as encrypted:
            self._render(" Encrypting", 3)
            encrypted.write(GnuPG.encrypted(unencrypted))

    def _parse_file_name(self, pdf):
        """
        We use a crude naming convention to make handling the sender and title
        easier:
          "sender - title.pdf"

        Returns a (sender, title) tuple. The sender is fetched or created by
        name; it is None when the file name has no " - " separator. `pdf`
        must match PARSER_REGEX_TITLE.
        """
        # First we attempt "sender - title.pdf"
        m = self.PARSER_REGEX_SENDER_TITLE.match(pdf)
        if m:
            sender_name, title = m.group(1), m.group(2)
            sender, __ = Sender.objects.get_or_create(
                name=sender_name, defaults={"slug": slugify(sender_name)})
            return sender, title

        # That didn't work, so we assume sender is None
        m = self.PARSER_REGEX_TITLE.match(pdf)
        return None, m.group(1)

    def _cleanup(self, pngs, pdf):
        """
        Delete the scratch images in `pngs` and the consumed `pdf`.

        The images are removed by the paths given instead of rebuilding a
        glob from the first one, so an empty `pngs` is handled and no regex
        over the file name is needed.
        """
        for f in list(pngs) + [pdf]:
            self._render(" Deleting {}".format(f), 2)
            os.unlink(f)

        self._render("", 2)

    def _render(self, text, verbosity):
        """Print `text` if the command's verbosity is at least `verbosity`."""
        if self.verbosity >= verbosity:
            print(text)