paperless-ngx/src/paperless_mail/parsers.py

428 lines
15 KiB
Python
Raw Normal View History

2022-04-29 22:58:11 +02:00
import re
from html import escape
from pathlib import Path
from typing import Optional
2022-04-29 22:58:11 +02:00
from bleach import clean
from bleach import linkify
from django.conf import settings
from django.utils.timezone import is_naive
from django.utils.timezone import make_aware
from gotenberg_client import GotenbergClient
from gotenberg_client.options import Margin
from gotenberg_client.options import PageSize
from gotenberg_client.options import PdfAFormat
from humanize import naturalsize
from imap_tools import MailAttachment
2022-04-29 22:58:11 +02:00
from imap_tools import MailMessage
from tika_client import TikaClient
2022-04-29 22:58:11 +02:00
from documents.parsers import DocumentParser
from documents.parsers import ParseError
from documents.parsers import make_thumbnail_from_pdf
2022-04-29 22:58:11 +02:00
class MailDocumentParser(DocumentParser):
"""
This parser uses imap_tools to parse .eml files, generates pdf using
Gotenberg and sends the html part to a Tika server for text extraction.
2022-04-29 22:58:11 +02:00
"""
logging_name = "paperless.parsing.mail"
@staticmethod
def _settings_to_gotenberg_pdfa() -> Optional[PdfAFormat]:
"""
Converts our requested PDF/A output into the Gotenberg API
format
"""
if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}:
return PdfAFormat.A2b
elif settings.OCR_OUTPUT_TYPE == "pdfa-1": # pragma: no cover
return PdfAFormat.A1a
elif settings.OCR_OUTPUT_TYPE == "pdfa-3": # pragma: no cover
return PdfAFormat.A3b
return None
def get_thumbnail(self, document_path: Path, mime_type: str, file_name=None):
2022-04-29 22:58:11 +02:00
if not self.archive_path:
self.archive_path = self.generate_pdf(
self.parse_file_to_message(document_path),
)
2022-04-29 22:58:11 +02:00
return make_thumbnail_from_pdf(
self.archive_path,
self.tempdir,
self.logging_group,
)
def extract_metadata(self, document_path: Path, mime_type: str):
2022-04-29 22:58:11 +02:00
result = []
try:
mail = self.parse_file_to_message(document_path)
2022-04-29 22:58:11 +02:00
except ParseError as e:
self.log.warning(
f"Error while fetching document metadata for {document_path}: {e}",
2022-04-29 22:58:11 +02:00
)
return result
for key, value in mail.headers.items():
value = ", ".join(i for i in value)
try:
value.encode("utf-8")
except UnicodeEncodeError as e: # pragma: no cover
self.log.debug(f"Skipping header {key}: {e}")
continue
2022-04-29 22:58:11 +02:00
result.append(
{
"namespace": "",
"prefix": "header",
"key": key,
"value": value,
},
)
result.append(
{
"namespace": "",
"prefix": "",
"key": "attachments",
"value": ", ".join(
f"{attachment.filename}"
f"({naturalsize(attachment.size, binary=True, format='%.2f')})"
2022-04-29 22:58:11 +02:00
for attachment in mail.attachments
),
},
)
result.append(
{
"namespace": "",
"prefix": "",
"key": "date",
"value": mail.date.strftime("%Y-%m-%d %H:%M:%S %Z"),
},
)
result.sort(key=lambda item: (item["prefix"], item["key"]))
return result
def parse(self, document_path: Path, mime_type: str, file_name=None):
"""
Parses the given .eml into formatted text, based on the decoded email.
"""
def strip_text(text: str):
"""
Reduces the spacing of the given text string
"""
2022-11-20 12:48:03 +01:00
text = re.sub(r"\s+", " ", text)
text = re.sub(r"(\n *)+", "\n", text)
return text.strip()
2022-04-29 22:58:11 +02:00
def build_formatted_text(mail_message: MailMessage) -> str:
"""
Constructs a formatted string, based on the given email. Basically tries
to get most of the email content, included front matter, into a nice string
"""
fmt_text = f"Subject: {mail_message.subject}\n\n"
fmt_text += f"From: {mail_message.from_values.full}\n\n"
to_list = [address.full for address in mail_message.to_values]
fmt_text += f"To: {', '.join(to_list)}\n\n"
if mail_message.cc_values:
fmt_text += (
f"CC: {', '.join(address.full for address in mail.cc_values)}\n\n"
)
if mail_message.bcc_values:
fmt_text += (
f"BCC: {', '.join(address.full for address in mail.bcc_values)}\n\n"
)
if mail_message.attachments:
att = []
for a in mail.attachments:
attachment_size = naturalsize(a.size, binary=True, format="%.2f")
att.append(
f"{a.filename} ({attachment_size})",
)
fmt_text += f"Attachments: {', '.join(att)}\n\n"
if mail.html:
fmt_text += "HTML content: " + strip_text(self.tika_parse(mail.html))
2022-04-29 22:58:11 +02:00
fmt_text += f"\n\n{strip_text(mail.text)}"
return fmt_text
self.log.debug(f"Parsing file {document_path.name} into an email")
mail = self.parse_file_to_message(document_path)
2022-04-29 22:58:11 +02:00
self.log.debug("Building formatted text from email")
self.text = build_formatted_text(mail)
if is_naive(mail.date):
self.date = make_aware(mail.date)
else:
self.date = mail.date
self.log.debug("Creating a PDF from the email")
self.archive_path = self.generate_pdf(mail)
@staticmethod
def parse_file_to_message(filepath: Path) -> MailMessage:
"""
Parses the given .eml file into a MailMessage object
"""
try:
with filepath.open("rb") as eml:
parsed = MailMessage.from_bytes(eml.read())
if parsed.from_values is None:
raise ParseError(
f"Could not parse {filepath}: Missing 'from'",
)
except Exception as err:
raise ParseError(
f"Could not parse {filepath}: {err}",
) from err
return parsed
2022-04-29 22:58:11 +02:00
def tika_parse(self, html: str):
self.log.info("Sending content to Tika server")
2022-05-03 18:02:08 +02:00
try:
with TikaClient(tika_url=settings.TIKA_ENDPOINT) as client:
parsed = client.tika.as_text.from_buffer(html, "text/html")
if parsed.content is not None:
return parsed.content.strip()
return ""
2022-05-03 18:02:08 +02:00
except Exception as err:
raise ParseError(
2022-10-15 13:13:29 +02:00
f"Could not parse content with tika server at "
f"{settings.TIKA_ENDPOINT}: {err}",
) from err
2022-05-03 18:02:08 +02:00
def generate_pdf(self, mail_message: MailMessage) -> Path:
archive_path = Path(self.tempdir) / "merged.pdf"
mail_pdf_file = self.generate_pdf_from_mail(mail_message)
# If no HTML content, create the PDF from the message
# Otherwise, create 2 PDFs and merge them with Gotenberg
if not mail_message.html:
archive_path.write_bytes(mail_pdf_file.read_bytes())
2022-11-20 14:22:30 +01:00
else:
pdf_of_html_content = self.generate_pdf_from_html(
mail_message.html,
mail_message.attachments,
)
self.log.debug("Merging email text and HTML content into single PDF")
with (
GotenbergClient(
host=settings.TIKA_GOTENBERG_ENDPOINT,
timeout=settings.CELERY_TASK_TIME_LIMIT,
) as client,
client.merge.merge() as route,
):
# Configure requested PDF/A formatting, if any
pdf_a_format = self._settings_to_gotenberg_pdfa()
if pdf_a_format is not None:
route.pdf_format(pdf_a_format)
route.merge([mail_pdf_file, pdf_of_html_content])
try:
response = route.run()
archive_path.write_bytes(response.content)
except Exception as err:
raise ParseError(
f"Error while merging email HTML into PDF: {err}",
) from err
return archive_path
def mail_to_html(self, mail: MailMessage) -> Path:
"""
Converts the given email into an HTML file, formatted
based on the given template
"""
def clean_html(text: str) -> str:
"""
Attempts to clean, escape and linkify the given HTML string
"""
2022-04-29 22:58:11 +02:00
if isinstance(text, list):
text = "\n".join([str(e) for e in text])
if not isinstance(text, str):
2022-04-29 22:58:11 +02:00
text = str(text)
text = escape(text)
2022-04-29 22:58:11 +02:00
text = clean(text)
text = linkify(text, parse_email=True)
text = text.replace("\n", "<br>")
return text
data = {}
2022-04-29 22:58:11 +02:00
data["subject"] = clean_html(mail.subject)
if data["subject"]:
2022-04-29 22:58:11 +02:00
data["subject_label"] = "Subject"
data["from"] = clean_html(mail.from_values.full)
if data["from"]:
2022-04-29 22:58:11 +02:00
data["from_label"] = "From"
data["to"] = clean_html(", ".join(address.full for address in mail.to_values))
if data["to"]:
2022-04-29 22:58:11 +02:00
data["to_label"] = "To"
data["cc"] = clean_html(", ".join(address.full for address in mail.cc_values))
if data["cc"]:
2022-04-29 22:58:11 +02:00
data["cc_label"] = "CC"
data["bcc"] = clean_html(", ".join(address.full for address in mail.bcc_values))
if data["bcc"]:
2022-04-29 22:58:11 +02:00
data["bcc_label"] = "BCC"
2022-05-05 00:20:32 +02:00
att = []
for a in mail.attachments:
att.append(
f"{a.filename} ({naturalsize(a.size, binary=True, format='%.2f')})",
)
2022-05-05 00:20:32 +02:00
data["attachments"] = clean_html(", ".join(att))
if data["attachments"]:
2022-05-05 00:20:32 +02:00
data["attachments_label"] = "Attachments"
2022-04-29 22:58:11 +02:00
data["date"] = clean_html(mail.date.astimezone().strftime("%Y-%m-%d %H:%M"))
data["content"] = clean_html(mail.text.strip())
2022-11-20 09:13:08 -08:00
from django.template.loader import render_to_string
html_file = Path(self.tempdir) / "email_as_html.html"
html_file.write_text(render_to_string("email_msg_template.html", context=data))
2022-04-29 22:58:11 +02:00
return html_file
def generate_pdf_from_mail(self, mail: MailMessage) -> Path:
"""
Creates a PDF based on the given email, using the email's values in a
an HTML template
"""
self.log.info("Converting mail to PDF")
2022-04-29 22:58:11 +02:00
css_file = Path(__file__).parent / "templates" / "output.css"
email_html_file = self.mail_to_html(mail)
with (
GotenbergClient(
host=settings.TIKA_GOTENBERG_ENDPOINT,
timeout=settings.CELERY_TASK_TIME_LIMIT,
) as client,
client.chromium.html_to_pdf() as route,
):
# Configure requested PDF/A formatting, if any
pdf_a_format = self._settings_to_gotenberg_pdfa()
if pdf_a_format is not None:
route.pdf_format(pdf_a_format)
try:
response = (
route.index(email_html_file)
.resource(css_file)
.margins(Margin(top=0.1, bottom=0.1, left=0.1, right=0.1))
.size(PageSize(height=11.7, width=8.27))
.scale(1.0)
.run()
)
except Exception as err:
raise ParseError(
f"Error while converting email to PDF: {err}",
) from err
email_as_pdf_file = Path(self.tempdir) / "email_as_pdf.pdf"
email_as_pdf_file.write_bytes(response.content)
return email_as_pdf_file
def generate_pdf_from_html(
self,
orig_html: str,
attachments: list[MailAttachment],
) -> Path:
"""
Generates a PDF file based on the HTML and attachments of the email
"""
def clean_html_script(text: str):
2022-10-18 23:48:07 +02:00
compiled_open = re.compile(re.escape("<script"), re.IGNORECASE)
text = compiled_open.sub("<div hidden ", text)
compiled_close = re.compile(re.escape("</script"), re.IGNORECASE)
text = compiled_close.sub("</div", text)
return text
self.log.info("Converting message html to PDF")
tempdir = Path(self.tempdir)
html_clean = clean_html_script(orig_html)
html_clean_file = tempdir / "index.html"
html_clean_file.write_text(html_clean)
with (
GotenbergClient(
host=settings.TIKA_GOTENBERG_ENDPOINT,
timeout=settings.CELERY_TASK_TIME_LIMIT,
) as client,
client.chromium.html_to_pdf() as route,
):
# Configure requested PDF/A formatting, if any
pdf_a_format = self._settings_to_gotenberg_pdfa()
if pdf_a_format is not None:
route.pdf_format(pdf_a_format)
# Add attachments as resources, cleaning the filename and replacing
# it in the index file for inclusion
for attachment in attachments:
# Clean the attachment name to be valid
name_cid = f"cid:{attachment.content_id}"
name_clean = "".join(e for e in name_cid if e.isalnum())
# Write attachment payload to a temp file
temp_file = tempdir / name_clean
temp_file.write_bytes(attachment.payload)
route.resource(temp_file)
# Replace as needed the name with the clean name
html_clean = html_clean.replace(name_cid, name_clean)
# Now store the cleaned up HTML version
html_clean_file = tempdir / "index.html"
html_clean_file.write_text(html_clean)
# This is our index file, the main page basically
route.index(html_clean_file)
# Set page size, margins
route.margins(Margin(top=0.1, bottom=0.1, left=0.1, right=0.1)).size(
PageSize(height=11.7, width=8.27),
).scale(1.0)
try:
response = route.run()
except Exception as err:
raise ParseError(
f"Error while converting document to PDF: {err}",
) from err
html_pdf = tempdir / "html.pdf"
html_pdf.write_bytes(response.content)
return html_pdf
Feature: Allow setting backend configuration settings via the UI (#5126) * Saving some start on this * At least partially working for the tesseract parser * Problems with migration testing need to figure out * Work around that error * Fixes max m_pixels * Moving the settings to main paperless application * Starting some consumer options * More fixes and work * Fixes these last tests * Fix max_length on OcrSettings.mode field * Fix all fields on Common & Ocr settings serializers * Umbrellla config view * Revert "Umbrellla config view" This reverts commit fbaf9f4be30f89afeb509099180158a3406416a5. * Updates to use a single configuration object for all settings * Squashed commit of the following: commit 8a0a49dd5766094f60462fbfbe62e9921fbd2373 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 23:02:47 2023 -0800 Fix formatting commit 66b2d90c507b8afd9507813ff555e46198ea33b9 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 22:36:35 2023 -0800 Refactor frontend data models commit 5723bd8dd823ee855625e250df39393e26709d48 Author: Adam Bogdał <adam@bogdal.pl> Date: Wed Dec 20 01:17:43 2023 +0100 Fix: speed up admin panel for installs with a large number of documents (#5052) commit 9b08ce176199bf9011a6634bb88f616846150d2b Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:18:51 2023 -0800 Update PULL_REQUEST_TEMPLATE.md commit a6248bec2d793b7690feed95fcaf5eb34a75bfb6 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 15:02:05 2023 -0800 Chore: Update Angular to v17 (#4980) commit b1f6f52486d5ba5c04af99b41315eb6428fd1fa8 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:53:56 2023 -0800 Fix: Dont allow null custom_fields property via API (#5063) commit 638d9970fd468d8c02c91d19bd28f8b0796bdcb1 Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 13:43:50 2023 -0800 Enhancement: symmetric document links (#4907) commit 5e8de4c1da6eb4eb8f738b20962595c7536b30ec Author: shamoon <4887959+shamoon@users.noreply.github.com> Date: Tue Dec 19 12:45:04 2023 -0800 Enhancement: shared icon & shared by me filter (#4859) commit 088bad90306025d3f6b139cbd0ad264a1cbecfe5 Author: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Tue Dec 19 12:04:03 2023 -0800 Bulk updates all the backend libraries (#5061) * Saving some work on frontend config * Very basic but dynamically-generated config form * Saving work on slightly less ugly frontend config * JSON validation for user_args field * Fully dynamic config form * Adds in some additional validators for a nicer error message * Cleaning up the testing and coverage more * Reverts unintentional change * Adds documentation about the settings and the precedence * Couple more commenting and style fixes --------- Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
2023-12-29 15:42:56 -08:00
def get_settings(self):
"""
This parser does not implement additional settings yet
"""
return None