paperless-ngx/src/documents/parsers.py

344 lines
10 KiB
Python
Raw Normal View History

import logging
import mimetypes
import os
import re
import shutil
import subprocess
import tempfile
2020-11-20 13:31:03 +01:00
import magic
from django.conf import settings
from django.utils import timezone
# This regular expression will try to find dates in the document at
# hand and will match the following formats:
# - XX.YY.ZZZZ with XX and YY being 1 or 2 digits and ZZZZ being 2 or 4 digits
# - XX/YY/ZZZZ with XX and YY being 1 or 2 digits and ZZZZ being 2 or 4 digits
# - XX-YY-ZZZZ with XX and YY being 1 or 2 digits and ZZZZ being 2 or 4 digits
# - ZZZZ.XX.YY with XX and YY being 1 or 2 digits and ZZZZ being 2 or 4 digits
# - ZZZZ/XX/YY with XX and YY being 1 or 2 digits and ZZZZ being 2 or 4 digits
# - ZZZZ-XX-YY with XX and YY being 1 or 2 digits and ZZZZ being 2 or 4 digits
# - XX. MONTH ZZZZ with XX being 1 or 2 digits and ZZZZ being 2 or 4 digits
# - MONTH ZZZZ with ZZZZ being 4 digits
# - MONTH XX, ZZZZ with XX being 1 or 2 digits and ZZZZ being 4 digits
from documents.loggers import LoggingMixin
from documents.signals import document_consumer_declaration
2020-11-12 21:09:45 +01:00
# TODO: isn't there a date parsing library for this?
# One alternative per supported date notation. Every alternative is wrapped
# in the same pair of guards: a leading `(\b|(?!=([_-])))` and a trailing
# `(\b|(?=([_-])))`.
# NOTE(review): the leading guard is spelled `(?!=...)`, which is a negative
# lookahead for a literal "=" followed by "_" or "-". A lookbehind such as
# `(?<=[_-])`, mirroring the trailing lookahead, was presumably intended.
# Confirm before changing it, since that alters which strings match.
DATE_REGEX = re.compile(
    r'(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' # NOQA: E501
    r'(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' # NOQA: E501
    r'(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' # NOQA: E501
    r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|'
    r'(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))'
)

# Module-level logger shared by the free functions in this file.
logger = logging.getLogger("paperless.parsing")
2020-11-20 13:31:03 +01:00
def is_mime_type_supported(mime_type):
    """
    Tell whether a parser class can be found for the given mime type.
    """
    parser_class = get_parser_class_for_mime_type(mime_type)
    return parser_class is not None
def get_default_file_extension(mime_type):
    """
    Return the file extension to use for the given mime type.

    The first parser declaration whose "mime_types" mapping contains the
    mime type decides the extension. If no declaration knows the type, the
    `mimetypes` module is asked instead, and an empty string is returned
    when that yields nothing either.
    """
    for _receiver, declaration in document_consumer_declaration.send(None):
        extension_by_mime_type = declaration["mime_types"]
        if mime_type in extension_by_mime_type:
            return extension_by_mime_type[mime_type]

    # No parser declared this mime type; fall back to the standard library.
    return mimetypes.guess_extension(mime_type) or ""
def is_file_ext_supported(ext):
    """
    Tell whether the given file extension is among the supported ones.

    The comparison is done on the lower-cased extension. A missing or empty
    extension is never supported.
    """
    if not ext:
        return False
    return ext.lower() in get_supported_file_extensions()
def get_supported_file_extensions():
    """
    Collect every file extension that `mimetypes` associates with any mime
    type declared by any parser, and return them as a set.
    """
    known_extensions = set()
    for _receiver, declaration in document_consumer_declaration.send(None):
        for declared_mime_type in declaration["mime_types"]:
            known_extensions |= set(
                mimetypes.guess_all_extensions(declared_mime_type))
    return known_extensions
2020-11-20 13:31:03 +01:00
def get_parser_class_for_mime_type(mime_type):
    """
    Return the parser class responsible for the given mime type, or None if
    no parser declaration lists that mime type.

    When several declarations match, the one with the highest "weight" wins.
    """
    # His last command was: COME! And they came. All of them. Even the
    # parsers.
    candidates = [
        declaration
        for _receiver, declaration in document_consumer_declaration.send(None)
        if mime_type in declaration["mime_types"]
    ]

    if not candidates:
        return None

    # max() keeps the first of several equally weighted declarations, which
    # is the same element a stable descending sort would put first.
    heaviest = max(candidates, key=lambda declaration: declaration["weight"])
    return heaviest["parser"]
2020-11-20 13:31:03 +01:00
def get_parser_class(path):
    """
    Determine the appropriate parser class for the file at `path`, using the
    mime type reported by `magic.from_file`.
    """
    detected_mime_type = magic.from_file(path, mime=True)
    return get_parser_class_for_mime_type(detected_mime_type)
2020-11-21 14:03:45 +01:00
def run_convert(input_file,
                output_file,
                density=None,
                scale=None,
                alpha=None,
                strip=False,
                trim=False,
                type=None,
                depth=None,
                auto_orient=False,
                extra=None,
                logging_group=None):
    """
    Run the configured convert binary (`settings.CONVERT_BINARY`) on
    `input_file`, writing the result to `output_file`.

    `input_file` and `output_file` may be strings or path-like objects.
    Each of `density`, `scale`, `alpha`, `type` and `depth` adds the option
    of the same name followed by its value, but only when the value is
    truthy. `strip`, `trim` and `auto_orient` are flags that add `-strip`,
    `-trim` and `-auto-orient` respectively.

    `extra` is accepted for interface compatibility but is currently not
    used. `logging_group` is attached to the debug log record.

    Raises ParseError if the process exits with a non-zero status.
    """
    environment = os.environ.copy()
    if settings.CONVERT_MEMORY_LIMIT:
        environment["MAGICK_MEMORY_LIMIT"] = settings.CONVERT_MEMORY_LIMIT
    if settings.CONVERT_TMPDIR:
        environment["MAGICK_TMPDIR"] = settings.CONVERT_TMPDIR

    args = [settings.CONVERT_BINARY]
    args += ['-density', str(density)] if density else []
    args += ['-scale', str(scale)] if scale else []
    args += ['-alpha', str(alpha)] if alpha else []
    args += ['-strip'] if strip else []
    args += ['-trim'] if trim else []
    args += ['-type', str(type)] if type else []
    args += ['-depth', str(depth)] if depth else []
    args += ['-auto-orient'] if auto_orient else []
    # Normalise path-like objects to plain strings. Without this, the
    # " ".join() in the log call below raises TypeError for pathlib.Path
    # arguments before the command is ever executed.
    args += [os.fspath(input_file), os.fspath(output_file)]

    logger.debug("Execute: " + " ".join(args), extra={'group': logging_group})

    if subprocess.Popen(args, env=environment).wait() != 0:
        raise ParseError("Convert failed at {}".format(args))
2021-03-14 14:42:48 +01:00
def get_default_thumbnail():
    """
    Return the path of the `resources/document.png` image located next to
    this module.
    """
    module_dir = os.path.dirname(__file__)
    return os.path.join(module_dir, "resources", "document.png")
def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None):
    """
    Build a thumbnail for the PDF at `in_path` by rendering it to PNG with
    Ghostscript and then running convert on that PNG.

    Returns the path of the thumbnail inside `temp_dir`. If either step
    fails with ParseError, the default thumbnail is returned instead.
    """
    thumbnail_path = os.path.join(temp_dir, "convert_gs.png")

    # This function is the fallback for a failed convert run: Ghostscript
    # renders the PDF to a PNG first, and convert only has to shrink that.
    logger.warning(
        "Thumbnail generation with ImageMagick failed, falling back "
        "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
        extra={'group': logging_group}
    )

    rendered_png = os.path.join(temp_dir, "gs_out.png")
    gs_command = [
        settings.GS_BINARY,
        "-q",
        "-sDEVICE=pngalpha",
        "-o", rendered_png,
        in_path,
    ]

    try:
        exit_code = subprocess.Popen(gs_command).wait()
        if exit_code != 0:
            raise ParseError("Thumbnail (gs) failed at {}".format(gs_command))

        # Ghostscript succeeded; scale its output down with convert.
        run_convert(
            input_file=rendered_png,
            output_file=thumbnail_path,
            density=300,
            scale="500x5000>",
            alpha="remove",
            strip=True,
            trim=False,
            auto_orient=True,
            logging_group=logging_group,
        )
    except ParseError:
        return get_default_thumbnail()

    return thumbnail_path
def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None):
    """
    The thumbnail of a PDF is just a 500px wide image of the first page.

    Returns the path of the generated image. If convert fails with
    ParseError, the Ghostscript based fallback decides the result.
    """
    thumbnail_path = os.path.join(temp_dir, "convert.png")
    # The "[0]" suffix restricts convert to the first page of the input.
    first_page = "{}[0]".format(in_path)

    try:
        run_convert(
            input_file=first_page,
            output_file=thumbnail_path,
            density=300,
            scale="500x5000>",
            alpha="remove",
            strip=True,
            trim=False,
            auto_orient=True,
            logging_group=logging_group,
        )
    except ParseError:
        return make_thumbnail_from_pdf_gs_fallback(
            in_path, temp_dir, logging_group)

    return thumbnail_path
2021-01-02 15:26:09 +01:00
2020-11-25 19:36:18 +01:00
def parse_date(filename, text):
    """
    Returns the date of the document.

    If `settings.FILENAME_DATE_ORDER` is set, the file name is searched
    first. Otherwise, or when the file name yields nothing usable, the text
    is searched using `settings.DATE_ORDER`. The first match of DATE_REGEX
    that parses to an acceptable date wins; None is returned when there is
    no such match.
    """

    def parse_with_order(date_string, date_order):
        """
        Call dateparser.parse with a particular date ordering
        """
        import dateparser

        parser_settings = {
            "DATE_ORDER": date_order,
            "PREFER_DAY_OF_MONTH": "first",
            "RETURN_AS_TIMEZONE_AWARE": True,
        }
        return dateparser.parse(date_string, settings=parser_settings)

    def keep_if_acceptable(candidate):
        """
        Return the candidate unless it is missing, not after the year 1900,
        in the future, or listed in settings.IGNORE_DATES.
        """
        if not candidate:
            return None
        if candidate.year <= 1900:
            return None
        if candidate > timezone.now():
            return None
        if candidate.date() in settings.IGNORE_DATES:
            return None
        return candidate

    def first_acceptable_date(haystack, date_order):
        """
        Walk the regex matches in order and return the first acceptable date
        """
        for match in re.finditer(DATE_REGEX, haystack):
            try:
                parsed = parse_with_order(match.group(0), date_order)
            except (TypeError, ValueError):
                # Skip all matches that do not parse to a proper date
                continue

            accepted = keep_if_acceptable(parsed)
            if accepted is not None:
                return accepted
        return None

    # if filename date parsing is enabled, search there first:
    if settings.FILENAME_DATE_ORDER:
        from_filename = first_acceptable_date(
            filename, settings.FILENAME_DATE_ORDER)
        if from_filename is not None:
            return from_filename

    # Otherwise the document text decides.
    return first_acceptable_date(text, settings.DATE_ORDER)
class ParseError(Exception):
    """
    Raised by this module when an external tool (convert, Ghostscript,
    optipng) exits with a non-zero status.
    """
class DocumentParser(LoggingMixin):
    """
    Subclass this to make your own parser. Have a look at
    `paperless_tesseract.parsers` for inspiration.
    """

    logging_name = "paperless.parsing"

    def __init__(self, logging_group, progress_callback=None):
        """
        Set up empty results and create a private scratch directory below
        `settings.SCRATCH_DIR` (which is created if it does not exist).
        """
        super().__init__()
        self.logging_group = logging_group
        self.progress_callback = progress_callback

        # Results; initialised empty here.
        self.archive_path = None
        self.text = None
        self.date = None

        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
        self.tempdir = tempfile.mkdtemp(
            prefix="paperless-", dir=settings.SCRATCH_DIR)

    def progress(self, current_progress, max_progress):
        """
        Forward the progress values to the callback, if one was given.
        """
        if not self.progress_callback:
            return
        self.progress_callback(current_progress, max_progress)

    def extract_metadata(self, document_path, mime_type):
        """
        Return the metadata of the document; this base class returns an
        empty list.
        """
        return []

    def parse(self, document_path, mime_type, file_name=None):
        """
        Not implemented in this base class.
        """
        raise NotImplementedError()

    def get_archive_path(self):
        """
        Return the current value of `archive_path`.
        """
        return self.archive_path

    def get_thumbnail(self, document_path, mime_type, file_name=None):
        """
        Returns the path to a file we can use as a thumbnail for this document.
        """
        raise NotImplementedError()

    def get_optimised_thumbnail(self,
                                document_path,
                                mime_type,
                                file_name=None):
        """
        Return the thumbnail from `get_thumbnail`, run through optipng when
        `settings.OPTIMIZE_THUMBNAILS` is enabled.

        Raises ParseError if optipng exits with a non-zero status.
        """
        thumbnail = self.get_thumbnail(document_path, mime_type, file_name)
        if not settings.OPTIMIZE_THUMBNAILS:
            return thumbnail

        optimised_path = os.path.join(self.tempdir, "thumb_optipng.png")
        command = (settings.OPTIPNG_BINARY,
                   "-silent", "-o5", thumbnail, "-out", optimised_path)

        self.log('debug', f"Execute: {' '.join(command)}")

        if subprocess.Popen(command).wait() != 0:
            raise ParseError("Optipng failed at {}".format(command))

        return optimised_path

    def get_text(self):
        """
        Return the current value of `text`.
        """
        return self.text

    def get_date(self):
        """
        Return the current value of `date`.
        """
        return self.date

    def cleanup(self):
        """
        Delete the scratch directory created in `__init__`.
        """
        self.log("debug", f"Deleting directory {self.tempdir}")
        shutil.rmtree(self.tempdir)