paperless-ngx/src/paperless_text/parsers.py

50 lines
1.6 KiB
Python
Raw Normal View History

2018-08-30 23:32:41 -04:00
import os
import subprocess
2020-12-16 14:17:05 +01:00
from PIL import ImageDraw, ImageFont, Image
2018-08-30 23:32:41 -04:00
from django.conf import settings
from documents.parsers import DocumentParser, ParseError
2018-08-30 23:32:41 -04:00
class TextDocumentParser(DocumentParser):
    """
    This parser directly parses a text document (.txt, .md, or .csv)
    """

    def get_thumbnail(self, document_path, mime_type):
        """
        Render the beginning of the document as a PNG thumbnail.

        The first 50 lines of the file are drawn in black onto a white
        500x700 canvas, and the image is saved as "thumb.png" inside
        ``self.tempdir``.

        :param document_path: path of the text file to render
        :param mime_type: not used by this implementation
        :return: path of the generated thumbnail image
        """

        def read_text():
            # Only the first 50 lines are ever drawn, so stop reading there
            # instead of loading the whole file into memory. zip() ends as
            # soon as range(50) is exhausted, without pulling a 51st line.
            with open(document_path, 'r') as src:
                lines = [line.strip() for _, line in zip(range(50), src)]
            text = "\n".join(lines)
            # Double quotes are swapped for single quotes in the rendered
            # text; kept so the thumbnail content stays unchanged.
            return text.replace('"', "'")

        # Newer Pillow releases expose the layout engines through the
        # ImageFont.Layout enum and no longer provide the LAYOUT_BASIC
        # module constant; older releases only have the constant.
        layout_enum = getattr(ImageFont, "Layout", None)
        if layout_enum is not None:
            basic_layout = layout_enum.BASIC
        else:
            basic_layout = ImageFont.LAYOUT_BASIC

        img = Image.new("RGB", (500, 700), color="white")
        draw = ImageDraw.Draw(img)
        font = ImageFont.truetype(
            "/usr/share/fonts/liberation/LiberationSerif-Regular.ttf", 20,
            layout_engine=basic_layout)
        draw.text((5, 5), read_text(), font=font, fill="black")

        # NOTE(review): self.tempdir is not set in this class; presumably it
        # is provided by DocumentParser -- confirm against the base class.
        out_path = os.path.join(self.tempdir, "thumb.png")
        img.save(out_path)

        return out_path

    def parse(self, document_path, mime_type):
        """
        Read the whole document and store its content in ``self.text``.

        :param document_path: path of the text file to read
        :param mime_type: not used by this implementation
        """
        with open(document_path, 'r') as f:
            self.text = f.read()
2018-08-30 23:32:41 -04:00
def run_command(*args):
    """
    Run a command line through the shell and fail loudly on a bad exit code.

    The positional arguments are joined with single spaces into one shell
    command. The child process receives a copy of the current environment,
    with MAGICK_MEMORY_LIMIT and MAGICK_TMPDIR added when the corresponding
    CONVERT_MEMORY_LIMIT / CONVERT_TMPDIR settings are truthy.

    :param args: the pieces of the command line, as strings
    :raises ParseError: if the command exits with a non-zero status
    """
    child_env = os.environ.copy()

    # Environment variable name -> configured value; falsy values are skipped.
    overrides = (
        ("MAGICK_MEMORY_LIMIT", settings.CONVERT_MEMORY_LIMIT),
        ("MAGICK_TMPDIR", settings.CONVERT_TMPDIR),
    )
    for variable, value in overrides:
        if value:
            child_env[variable] = value

    # NOTE(review): the joined string is interpreted by the shell, so every
    # argument must already be shell-safe (quoted/escaped) by the caller.
    command_line = ' '.join(args)
    process = subprocess.Popen(command_line, env=child_env, shell=True)
    exit_status = process.wait()

    if exit_status != 0:
        raise ParseError("Convert failed at {}".format(args))