paperless-ngx/src/documents/management/commands/consume.py

190 lines
4.8 KiB
Python
Raw Normal View History

2015-12-26 13:21:33 +00:00
import datetime
2015-12-20 19:23:33 +00:00
import glob
2016-01-01 16:13:59 +00:00
import gnupg
2015-12-20 19:23:33 +00:00
import os
import random
import re
import subprocess
import time
2015-12-20 19:23:33 +00:00
import pyocr
from PIL import Image
from django.conf import settings
from django.core.management.base import BaseCommand
2015-12-26 13:21:33 +00:00
from django.utils import timezone
2015-12-20 19:23:33 +00:00
from documents.models import Document
class Command(BaseCommand):
    """
    Loop over every pdf found in CONSUMPTION_DIR and:
      1. Convert it to one or more greyscale png images in SCRATCH_DIR
      2. Run the first available pyocr tool over those images
      3. Store the OCR'd text in the database as a Document
      4. Write a symmetrically GPG-encrypted copy of the pdf to the
         document's `pdf_path`
      5. Delete the original pdf and the scratch images
    """

    LOOP_TIME = 10  # Seconds

    CONVERT = settings.CONVERT_BINARY
    SCRATCH = settings.SCRATCH_DIR
    CONSUME = settings.CONSUMPTION_DIR

    # NOTE: evaluated at import time; raises IndexError if pyocr finds no
    # OCR tool installed.
    OCR = pyocr.get_available_tools()[0]

    MEDIA_PDF = os.path.join(settings.MEDIA_ROOT, "documents", "pdf")

    # Greedy on purpose: "a - b - c.pdf" yields sender "a - b", title "c".
    PARSER_REGEX = re.compile(r"^.*/(.*) - (.*)\.pdf$")

    # Matches the "-<n>" page suffix added to the output name when the
    # converter emits more than one image for a pdf.
    _PAGE_REGEX = re.compile(r"-(\d+)\.png$")

    def __init__(self, *args, **kwargs):
        self.verbosity = 0
        # Maps a pdf path to the mtime seen on the previous pass; used by
        # _is_ready() to detect files that are still being written.
        self.stats = {}
        self.gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)
        super().__init__(*args, **kwargs)

    def handle(self, *args, **options):
        """
        Entry point: poll the consumption directory every LOOP_TIME seconds
        until interrupted with Ctrl-C.
        """
        self.verbosity = options["verbosity"]
        self._setup()
        try:
            while True:
                self.loop()
                time.sleep(self.LOOP_TIME)
                if self.verbosity > 1:
                    print(".")
        except KeyboardInterrupt:
            print("Exiting")

    def loop(self):
        """
        Make a single pass over the consumption directory, consuming every
        regular `.pdf` file whose mtime has not changed since the last pass.
        """
        for name in os.listdir(self.CONSUME):
            pdf = os.path.join(self.CONSUME, name)

            if not os.path.isfile(pdf):
                continue

            if not pdf.endswith(".pdf"):
                continue

            # Skip files that are *not* ready yet.  (The test used to be
            # inverted, which consumed files the first time they were seen
            # -- possibly while the scanner was still writing them.)
            if not self._is_ready(pdf):
                continue

            self._render("Consuming {}".format(pdf), 1)

            pngs = self._get_greyscale(pdf)
            text = self._get_ocr(pngs)
            self._store(text, pdf)
            self._cleanup(pngs, pdf)

    def _setup(self):
        """
        Make sure the scratch and media directories exist.
        """
        for d in (self.SCRATCH, self.MEDIA_PDF):
            os.makedirs(d, exist_ok=True)

    def _is_ready(self, pdf):
        """
        Detect whether `pdf` is ready to consume or if it's still being written
        to by the scanner.

        A file is considered ready when its mtime is identical to the one
        recorded on the previous call for the same path.  Returns True (and
        forgets the recorded mtime) in that case; otherwise records the
        current mtime and returns False.
        """
        t = os.stat(pdf).st_mtime
        if self.stats.get(pdf) == t:
            del self.stats[pdf]
            return True
        self.stats[pdf] = t
        return False

    def _get_greyscale(self, pdf):
        """
        Convert `pdf` into greyscale png image(s) in the scratch directory
        and return their paths in page order.
        """
        self._render(" Generating greyscale image", 2)

        # Random numeric stem so that the output of this run can be found
        # again with a glob, whatever page suffixes the converter appends.
        i = random.randint(1000000, 9999999)
        png = os.path.join(self.SCRATCH, "{}.png".format(i))

        status = subprocess.call((
            self.CONVERT, "-density", "300", "-depth", "8",
            "-type", "grayscale", pdf, png
        ))
        if status != 0:
            self._render(
                " {} exited with status {}".format(self.CONVERT, status), 1)

        pngs = glob.glob(os.path.join(self.SCRATCH, "{}*".format(i)))

        # Sort numerically by page suffix: a plain lexical sort would put
        # "<i>-10.png" before "<i>-2.png" and scramble the OCR'd text.
        return sorted(pngs, key=lambda p: (self._page_number(p), p))

    @classmethod
    def _page_number(cls, png):
        """
        Return the numeric "-<n>" page suffix of `png`, or 0 if it has none.
        """
        m = cls._PAGE_REGEX.search(png)
        return int(m.group(1)) if m else 0

    def _get_ocr(self, pngs):
        """
        OCR every image in `pngs` and return the concatenated text, each
        page followed by a run of blank lines.
        """
        self._render(" OCRing the PDF", 2)

        pages = []
        for png in pngs:
            # `pngs` already holds full paths (they come from a glob rooted
            # at SCRATCH), so they must not be joined onto SCRATCH again.
            with Image.open(png) as f:
                self._render(" {}".format(f.filename), 3)
                pages.append(self.OCR.image_to_string(f))

        return "".join(page + "\n\n\n\n\n\n\n\n" for page in pages)

    def _store(self, text, pdf):
        """
        Create a Document for `pdf` with `text` as its content, then write a
        symmetrically encrypted copy of the pdf to the document's
        `pdf_path`.

        Raises RuntimeError if GPG reports that encryption failed, so that
        the caller never goes on to delete the only copy of the pdf.
        """
        sender, title = self._parse_file_name(pdf)
        stats = os.stat(pdf)

        # The file's mtime is the best guess we have for both timestamps.
        stamp = timezone.make_aware(
            datetime.datetime.fromtimestamp(stats.st_mtime))

        self._render(" Saving record to database", 2)

        doc = Document.objects.create(
            sender=sender,
            title=title,
            content=text,
            created=stamp,
            modified=stamp
        )

        self._render(" Encrypting", 3)
        with open(pdf, "rb") as unencrypted:
            result = self.gpg.encrypt_file(
                unencrypted,
                recipients=None,
                passphrase=settings.PASSPHRASE,
                symmetric=True
            )

        if not result.ok:
            # Don't leave a record pointing at a file that was never written.
            doc.delete()
            raise RuntimeError(
                "Encrypting {} failed: {}".format(pdf, result.status))

        with open(doc.pdf_path, "wb") as encrypted:
            encrypted.write(result.data)

    def _parse_file_name(self, pdf):
        """
        We use a crude naming convention to make handling the sender and title
        easier:
          "sender - title.pdf"

        Returns a `(sender, title)` tuple, or `("", "")` if the path doesn't
        follow the convention.
        """
        m = self.PARSER_REGEX.match(pdf)
        if m:
            return m.group(1), m.group(2)
        return "", ""

    def _cleanup(self, pngs, pdf):
        """
        Delete the scratch images in `pngs` and the original `pdf`.
        """
        # `pngs` is exactly the set of files the conversion produced, so
        # there is no need to rebuild a glob from the first entry (which
        # also blew up with IndexError when the conversion produced nothing).
        for f in list(pngs) + [pdf]:
            self._render(" Deleting {}".format(f), 2)
            os.unlink(f)

    def _render(self, text, verbosity):
        """
        Print `text` if the command's verbosity is at least `verbosity`.
        """
        if self.verbosity >= verbosity:
            print(text)