2018-08-30 23:32:41 -04:00
|
|
|
import os
|
|
|
|
|
|
|
|
|
|
from django.conf import settings
|
2020-12-29 12:26:41 +01:00
|
|
|
from documents.parsers import DocumentParser
|
2022-03-11 10:55:51 -08:00
|
|
|
from PIL import Image
|
|
|
|
|
from PIL import ImageDraw
|
|
|
|
|
from PIL import ImageFont
|
2018-08-30 23:32:41 -04:00
|
|
|
|
2022-03-21 22:27:32 +01:00
|
|
|
# Runs at import time: copies the configured OCR_MAX_IMAGE_PIXELS setting onto
# the shared PIL.Image module, so it applies to every user of PIL in this
# process, not only to this parser.
# NOTE(review): Pillow uses MAX_IMAGE_PIXELS as its decompression-bomb size
# threshold — confirm which values (e.g. None/0) the setting may take.
Image.MAX_IMAGE_PIXELS = settings.OCR_MAX_IMAGE_PIXELS
|
2022-03-19 14:58:41 +01:00
|
|
|
|
2018-08-30 23:32:41 -04:00
|
|
|
|
|
|
|
|
class TextDocumentParser(DocumentParser):
    """
    This parser directly parses a text document (.txt, .md, or .csv)

    The document content is read as UTF-8 and stored on ``self.text``; the
    thumbnail is a PNG rendering of the first lines of the file.
    """

    logging_name = "paperless.parsing.text"

    # Encoding used for every read of the source document. Passing it
    # explicitly keeps the result independent of the process locale.
    _ENCODING = "utf-8"

    # Only this many leading lines of the document are drawn on the thumbnail.
    _THUMBNAIL_MAX_LINES = 50

    def get_thumbnail(self, document_path, mime_type, file_name=None):
        """
        Render the first lines of the document onto a white 500x700 image.

        :param document_path: path of the text file to preview
        :param mime_type: not used by this implementation
        :param file_name: not used by this implementation
        :return: path of the generated ``thumb.png`` inside ``self.tempdir``
        """

        def read_text():
            # zip() against a bounded range stops pulling lines from the file
            # once the limit is reached, so a very large document is never
            # loaded into memory just to draw a preview.
            # Undecodable bytes are replaced rather than raised: the thumbnail
            # is only a preview and should not fail on a stray byte.
            with open(
                document_path,
                encoding=self._ENCODING,
                errors="replace",
            ) as src:
                lines = [
                    line.strip()
                    for _, line in zip(range(self._THUMBNAIL_MAX_LINES), src)
                ]
            return "\n".join(lines)

        # Pillow 9.1 introduced the ImageFont.Layout enum and Pillow 10 removed
        # the module-level LAYOUT_BASIC constant, so accept either spelling.
        layout_enum = getattr(ImageFont, "Layout", None)
        if layout_enum is not None:
            basic_layout = layout_enum.BASIC
        else:
            basic_layout = ImageFont.LAYOUT_BASIC

        img = Image.new("RGB", (500, 700), color="white")
        draw = ImageDraw.Draw(img)
        font = ImageFont.truetype(
            font=settings.THUMBNAIL_FONT_NAME,
            size=20,
            layout_engine=basic_layout,
        )
        draw.text((5, 5), read_text(), font=font, fill="black")

        out_path = os.path.join(self.tempdir, "thumb.png")
        img.save(out_path)

        return out_path

    def parse(self, document_path, mime_type, file_name=None):
        """
        Read the whole document and store its content in ``self.text``.

        The file is decoded as UTF-8. If it contains bytes that are not valid
        UTF-8, a warning is logged under ``logging_name`` and the file is
        re-read with the invalid bytes replaced, instead of aborting the parse.

        :param document_path: path of the text file to read
        :param mime_type: not used by this implementation
        :param file_name: not used by this implementation
        """
        # Imported locally so this block does not depend on the file's
        # top-level import list.
        import logging

        try:
            with open(document_path, encoding=self._ENCODING) as f:
                self.text = f.read()
        except UnicodeDecodeError as err:
            logging.getLogger(self.logging_name).warning(
                "Unicode error while reading %s, replacing invalid bytes: %s",
                document_path,
                err,
            )
            with open(
                document_path,
                encoding=self._ENCODING,
                errors="replace",
            ) as f:
                self.text = f.read()
|