paperless-ngx/src/paperless_text/parsers.py

43 lines
1.2 KiB
Python
Raw Normal View History

2018-08-30 23:32:41 -04:00
import os
from itertools import islice

from django.conf import settings
from documents.parsers import DocumentParser
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
2018-08-30 23:32:41 -04:00
# Override Pillow's module-wide pixel-count limit with the value configured
# in the Django settings (OCR_MAX_IMAGE_PIXELS). This runs at import time and
# affects every later use of PIL.Image in this process.
Image.MAX_IMAGE_PIXELS = settings.OCR_MAX_IMAGE_PIXELS
2022-03-19 14:58:41 +01:00
2018-08-30 23:32:41 -04:00
class TextDocumentParser(DocumentParser):
    """
    This parser directly parses a text document (.txt, .md, or .csv)
    """

    logging_name = "paperless.parsing.text"

    def get_thumbnail(self, document_path, mime_type, file_name=None):
        """
        Render the beginning of the document as a PNG thumbnail.

        The first 50 lines of the file (each stripped of surrounding
        whitespace) are drawn in black onto a white 500x700 image using the
        font named by ``settings.THUMBNAIL_FONT_NAME``.

        :param document_path: path of the text file to preview
        :param mime_type: unused by this parser
        :param file_name: unused by this parser
        :return: path of the generated ``thumb.png`` inside ``self.tempdir``
        """

        def read_text():
            # Only the first 50 lines are ever drawn, so stop reading there
            # instead of loading the whole file into memory.
            # Undecodable bytes are replaced rather than raising: a preview
            # with a few replacement characters beats no thumbnail at all.
            with open(document_path, encoding="utf-8", errors="replace") as src:
                return "\n".join(line.strip() for line in islice(src, 50))

        img = Image.new("RGB", (500, 700), color="white")
        draw = ImageDraw.Draw(img)
        font = ImageFont.truetype(
            font=settings.THUMBNAIL_FONT_NAME,
            size=20,
            layout_engine=ImageFont.LAYOUT_BASIC,
        )
        draw.text((5, 5), read_text(), font=font, fill="black")

        # NOTE(review): self.tempdir is not set in this class; presumably it
        # is provided by DocumentParser -- confirm against the base class.
        out_path = os.path.join(self.tempdir, "thumb.png")
        img.save(out_path)

        return out_path

    def parse(self, document_path, mime_type, file_name=None):
        """
        Read the whole document and store its content in ``self.text``.

        The file is decoded as UTF-8 independent of the process locale;
        bytes that are not valid UTF-8 are replaced with U+FFFD so that a
        stray byte does not abort parsing of the document.

        :param document_path: path of the text file to read
        :param mime_type: unused by this parser
        :param file_name: unused by this parser
        """
        with open(document_path, encoding="utf-8", errors="replace") as f:
            self.text = f.read()