paperless-ngx/src/documents/models.py

437 lines
13 KiB
Python
Raw Normal View History

2018-01-06 17:23:07 +00:00
# coding=utf-8
2016-02-27 20:18:50 +00:00
import logging
2020-11-20 13:31:03 +01:00
import mimetypes
2016-01-01 16:13:59 +00:00
import os
2016-01-28 07:23:11 +00:00
import re
from collections import OrderedDict
2016-03-24 19:18:33 +00:00
2018-09-09 21:03:37 +01:00
import dateutil.parser
2016-01-01 16:13:59 +00:00
from django.conf import settings
2015-12-20 19:23:33 +00:00
from django.db import models
2015-12-26 13:20:52 +00:00
from django.utils import timezone
from django.utils.text import slugify
2015-12-20 19:23:33 +00:00
2018-09-09 21:03:37 +01:00
class MatchingModel(models.Model):
    """Abstract base for models that can be automatically matched against a
    document's OCR'd text (correspondents, tags, document types)."""

    # Matching algorithm identifiers; the integer is what gets stored in the
    # database, so the values must never be renumbered.
    MATCH_ANY = 1
    MATCH_ALL = 2
    MATCH_LITERAL = 3
    MATCH_REGEX = 4
    MATCH_FUZZY = 5
    MATCH_AUTO = 6

    MATCHING_ALGORITHMS = (
        (MATCH_ANY, "Any"),
        (MATCH_ALL, "All"),
        (MATCH_LITERAL, "Literal"),
        (MATCH_REGEX, "Regular Expression"),
        (MATCH_FUZZY, "Fuzzy Match"),
        (MATCH_AUTO, "Automatic Classification"),
    )

    name = models.CharField(max_length=128, unique=True)
    slug = models.SlugField(blank=True, editable=False)
    match = models.CharField(max_length=256, blank=True)
    matching_algorithm = models.PositiveIntegerField(
        choices=MATCHING_ALGORITHMS,
        default=MATCH_ANY,
        help_text=(
            "Which algorithm you want to use when matching text to the OCR'd "
            "PDF. Here, \"any\" looks for any occurrence of any word "
            "provided in the PDF, while \"all\" requires that every word "
            "provided appear in the PDF, albeit not in the order provided. A "
            "\"literal\" match means that the text you enter must appear in "
            "the PDF exactly as you've entered it, and \"regular expression\" "
            "uses a regex to match the PDF. (If you don't know what a regex "
            "is, you probably don't want this option.) Finally, a \"fuzzy "
            "match\" looks for words or phrases that are mostly—but not "
            "exactly—the same, which can be useful for matching against "
            "documents containing imperfections that foil accurate OCR."
        )
    )

    is_insensitive = models.BooleanField(default=True)

    class Meta:
        abstract = True
        ordering = ("name",)

    def __str__(self):
        return self.name

    def save(self, *args, **kwargs):
        # Normalize the match string and keep the slug in sync with the name
        # before persisting.
        self.match = self.match.lower()
        self.slug = slugify(self.name)

        # Use super() so the full MRO is respected, instead of calling
        # models.Model.save() directly (which would silently skip any
        # intermediate mixin's save()).
        super().save(*args, **kwargs)
class Correspondent(MatchingModel):
    """The party a document was received from or sent to; all fields and
    matching behavior are inherited from MatchingModel."""

    # This regex is probably more restrictive than it needs to be, but it's
    # better safe than sorry.
    # NOTE(review): no usage of SAFE_REGEX is visible in this file -- confirm
    # it is still referenced elsewhere before removing.
    SAFE_REGEX = re.compile(r"^[\w\- ,.']+$")

    class Meta:
        ordering = ("name",)
class Tag(MatchingModel):
    """A label that can be attached to documents (see Document.tags); fields
    and matching behavior are inherited from MatchingModel."""

    # Preset label colours; the integer key is what gets stored in the
    # database, the hex string is the displayed colour.
    COLOURS = (
        (1, "#a6cee3"),
        (2, "#1f78b4"),
        (3, "#b2df8a"),
        (4, "#33a02c"),
        (5, "#fb9a99"),
        (6, "#e31a1c"),
        (7, "#fdbf6f"),
        (8, "#ff7f00"),
        (9, "#cab2d6"),
        (10, "#6a3d9a"),
        (11, "#b15928"),
        (12, "#000000"),
        (13, "#cccccc")
    )

    colour = models.PositiveIntegerField(choices=COLOURS, default=1)

    is_inbox_tag = models.BooleanField(
        default=False,
        help_text="Marks this tag as an inbox tag: All newly consumed "
                  "documents will be tagged with inbox tags."
    )
2018-08-24 13:45:15 +02:00
class DocumentType(MatchingModel):
    """A user-assignable document category; everything (name, slug, matching)
    is inherited from MatchingModel with no additional fields."""
    pass
2015-12-20 19:23:33 +00:00
class Document(models.Model):
    """A single consumed document: its extracted text, metadata, and the
    on-disk locations of the original, archive, and thumbnail files."""

    STORAGE_TYPE_UNENCRYPTED = "unencrypted"
    STORAGE_TYPE_GPG = "gpg"
    STORAGE_TYPES = (
        (STORAGE_TYPE_UNENCRYPTED, "Unencrypted"),
        (STORAGE_TYPE_GPG, "Encrypted with GNU Privacy Guard")
    )

    correspondent = models.ForeignKey(
        Correspondent,
        blank=True,
        null=True,
        related_name="documents",
        on_delete=models.SET_NULL
    )

    title = models.CharField(max_length=128, blank=True, db_index=True)

    document_type = models.ForeignKey(
        DocumentType,
        blank=True,
        null=True,
        related_name="documents",
        on_delete=models.SET_NULL
    )

    content = models.TextField(
        blank=True,
        help_text="The raw, text-only data of the document. This field is "
                  "primarily used for searching."
    )

    mime_type = models.CharField(
        max_length=256,
        editable=False
    )

    tags = models.ManyToManyField(
        Tag, related_name="documents", blank=True)

    checksum = models.CharField(
        max_length=32,
        editable=False,
        unique=True,
        help_text="The checksum of the original document (before it was "
                  "encrypted). We use this to prevent duplicate document "
                  "imports."
    )

    created = models.DateTimeField(
        default=timezone.now, db_index=True)

    modified = models.DateTimeField(
        auto_now=True, editable=False, db_index=True)

    storage_type = models.CharField(
        max_length=11,
        choices=STORAGE_TYPES,
        default=STORAGE_TYPE_UNENCRYPTED,
        editable=False
    )

    added = models.DateTimeField(
        default=timezone.now, editable=False, db_index=True)

    filename = models.FilePathField(
        max_length=1024,
        editable=False,
        default=None,
        null=True,
        help_text="Current filename in storage"
    )

    archive_serial_number = models.IntegerField(
        blank=True,
        null=True,
        unique=True,
        db_index=True,
        help_text="The position of this document in your physical document "
                  "archive."
    )

    class Meta:
        ordering = ("correspondent", "title")

    def __str__(self):
        # "<created>: <correspondent> - <title>", omitting whichever parts
        # are missing.
        created = self.created.strftime("%Y%m%d%H%M%S")
        if self.correspondent and self.title:
            return "{}: {} - {}".format(
                created, self.correspondent, self.title)
        if self.correspondent or self.title:
            return "{}: {}".format(created, self.correspondent or self.title)
        return str(created)

    @property
    def source_path(self):
        """Absolute path of the original file under ORIGINALS_DIR."""
        if self.filename:
            fname = str(self.filename)
        else:
            # No stored filename: derive one from the primary key and the
            # MIME-type extension, with ".gpg" appended when encrypted.
            fname = "{:07}{}".format(self.pk, self.file_type)
            if self.storage_type == self.STORAGE_TYPE_GPG:
                fname += ".gpg"

        return os.path.join(
            settings.ORIGINALS_DIR,
            fname
        )

    @property
    def source_file(self):
        """Open the original file for binary reading; caller must close it."""
        return open(self.source_path, "rb")

    @property
    def archive_path(self):
        """Absolute path of the archived PDF under ARCHIVE_DIR."""
        # Archived versions are always PDFs, named after the primary key.
        fname = "{:07}{}".format(self.pk, ".pdf")

        return os.path.join(
            settings.ARCHIVE_DIR,
            fname
        )

    @property
    def archive_file(self):
        """Open the archive file for binary reading; caller must close it."""
        return open(self.archive_path, "rb")

    @property
    def file_name(self):
        """A presentable download name: slugified __str__ plus extension."""
        return slugify(str(self)) + self.file_type

    @property
    def archive_file_name(self):
        """Download name for the archive version (always a PDF)."""
        return slugify(str(self)) + ".pdf"

    @property
    def file_type(self):
        # guess_extension() returns None for unknown MIME types; fall back to
        # an empty string so filename construction never raises TypeError
        # (file_name) or embeds a literal "None" (source_path).
        return mimetypes.guess_extension(str(self.mime_type)) or ""

    @property
    def thumbnail_path(self):
        """Absolute path of the thumbnail PNG under THUMBNAIL_DIR."""
        file_name = "{:07}.png".format(self.pk)
        if self.storage_type == self.STORAGE_TYPE_GPG:
            file_name += ".gpg"

        return os.path.join(
            settings.THUMBNAIL_DIR,
            file_name
        )

    @property
    def thumbnail_file(self):
        """Open the thumbnail for binary reading; caller must close it."""
        return open(self.thumbnail_path, "rb")
2016-02-27 20:18:50 +00:00
class Log(models.Model):
    """A persisted application log record."""

    # Levels mirror the stdlib ``logging`` constants so records can be
    # filtered with the same numeric thresholds.
    LEVELS = (
        (logging.DEBUG, "Debugging"),
        (logging.INFO, "Informational"),
        (logging.WARNING, "Warning"),
        (logging.ERROR, "Error"),
        (logging.CRITICAL, "Critical"),
    )

    # Optional UUID shared by related entries.
    # NOTE(review): grouping semantics are not visible in this file --
    # presumably one UUID per consumption run; confirm at the call site.
    group = models.UUIDField(blank=True, null=True)
    message = models.TextField()
    level = models.PositiveIntegerField(choices=LEVELS, default=logging.INFO)
    created = models.DateTimeField(auto_now_add=True)

    class Meta:
        # Newest entries first.
        ordering = ("-created",)

    def __str__(self):
        return self.message
2020-11-21 12:12:19 +01:00
# TODO: why is this in the models file?
class FileInfo:
    """Parses document metadata (created date, correspondent, title, tags,
    extension) out of a filename that follows the consumption naming
    convention; see from_filename() for the recognized formats."""

    # This epic regex *almost* worked for our needs, so I'm keeping it here for
    # posterity, in the hopes that we might find a way to make it work one day.
    ALMOST_REGEX = re.compile(
        r"^((?P<date>\d\d\d\d\d\d\d\d\d\d\d\d\d\dZ){separator})?"
        r"((?P<correspondent>{non_separated_word}+){separator})??"
        r"(?P<title>{non_separated_word}+)"
        r"({separator}(?P<tags>[a-z,0-9-]+))?"
        r"\.(?P<extension>[a-zA-Z.-]+)$".format(
            separator=r"\s+-\s+",
            non_separated_word=r"([\w,. ]|([^\s]-))"
        )
    )

    # Ordered most-specific first: from_filename() uses the first pattern
    # that matches. The final "title" pattern matches anything, so matching
    # can never fall through entirely.
    REGEXES = OrderedDict([
        ("created-correspondent-title-tags", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*) - "
            r"(?P<tags>[a-z0-9\-,]*)$",
            flags=re.IGNORECASE
        )),
        ("created-title-tags", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<title>.*) - "
            r"(?P<tags>[a-z0-9\-,]*)$",
            flags=re.IGNORECASE
        )),
        ("created-correspondent-title", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*)$",
            flags=re.IGNORECASE
        )),
        ("created-title", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<title>.*)$",
            flags=re.IGNORECASE
        )),
        ("correspondent-title-tags", re.compile(
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*) - "
            r"(?P<tags>[a-z0-9\-,]*)$",
            flags=re.IGNORECASE
        )),
        ("correspondent-title", re.compile(
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*)?$",
            flags=re.IGNORECASE
        )),
        ("title", re.compile(
            r"(?P<title>.*)$",
            flags=re.IGNORECASE
        ))
    ])

    def __init__(self, created=None, correspondent=None, title=None, tags=(),
                 extension=None):
        self.created = created
        self.title = title
        self.extension = extension
        self.correspondent = correspondent
        self.tags = tags

    @classmethod
    def _get_created(cls, created):
        # Strip the trailing "Z" and right-pad bare dates (YYYYMMDD) out to a
        # full 14-digit timestamp before parsing. dateutil can raise
        # OverflowError as well as ValueError on absurd numeric input, so
        # treat both as "no parsable date".
        try:
            return dateutil.parser.parse("{:0<14}Z".format(created[:-1]))
        except (ValueError, OverflowError):
            return None

    @classmethod
    def _get_correspondent(cls, name):
        # Look up or create the Correspondent model for this name.
        if not name:
            return None
        return Correspondent.objects.get_or_create(name=name, defaults={
            "slug": slugify(name)
        })[0]

    @classmethod
    def _get_title(cls, title):
        return title

    @classmethod
    def _get_tags(cls, tags):
        # "tags" is a comma-separated string; each entry becomes a Tag model
        # (created on first sight, keyed by slug).
        r = []
        for t in tags.split(","):
            r.append(Tag.objects.get_or_create(
                slug=slugify(t),
                defaults={"name": t}
            )[0])
        return tuple(r)

    @classmethod
    def _mangle_property(cls, properties, name):
        # Replace a raw regex capture with its parsed/model counterpart by
        # dispatching to the matching _get_<name>() helper, if present.
        if name in properties:
            properties[name] = getattr(cls, "_get_{}".format(name))(
                properties[name]
            )

    @classmethod
    def from_filename(cls, filename):
        """
        We use a crude naming convention to make handling the correspondent,
        title, and tags easier:
          "<date> - <correspondent> - <title> - <tags>"
          "<correspondent> - <title> - <tags>"
          "<correspondent> - <title>"
          "<title>"
        """

        # Mutate filename in-place before parsing its components
        # by applying at most one of the configured transformations.
        for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS:
            (filename, count) = pattern.subn(repl, filename)
            if count:
                break

        # do this after the transforms so that the transforms can do whatever
        # with the file extension.
        filename_no_ext = os.path.splitext(filename)[0]

        if filename_no_ext == filename and filename.startswith("."):
            # This is a very special case where there is no text before the
            # file type.
            # TODO: this should be handled better. The ext is not removed
            #       because usually, files like '.pdf' are just hidden files
            #       with the name pdf, but in our case, its more likely that
            #       there's just no name to begin with.
            filename = ""
            # This isn't too bad either, since we'll just not match anything
            # and return an empty title. TODO: actually, this is kinda bad.
        else:
            filename = filename_no_ext

        # Parse filename components. The catch-all "title" pattern guarantees
        # one of these matches, so this always returns inside the loop.
        for regex in cls.REGEXES.values():
            m = regex.match(filename)
            if m:
                properties = m.groupdict()
                cls._mangle_property(properties, "created")
                cls._mangle_property(properties, "correspondent")
                cls._mangle_property(properties, "title")
                cls._mangle_property(properties, "tags")
                return cls(**properties)