paperless-ngx/src/documents/models.py

490 lines
13 KiB
Python
Raw Normal View History

2018-01-06 17:23:07 +00:00
# coding=utf-8
import datetime
2016-02-27 20:18:50 +00:00
import logging
2016-01-01 16:13:59 +00:00
import os
2016-01-28 07:23:11 +00:00
import re
from collections import OrderedDict
2016-03-24 19:18:33 +00:00
import pathvalidate
2018-09-09 21:03:37 +01:00
import dateutil.parser
2016-01-01 16:13:59 +00:00
from django.conf import settings
2020-12-12 15:46:56 +01:00
from django.contrib.auth.models import User
2015-12-20 19:23:33 +00:00
from django.db import models
2015-12-26 13:20:52 +00:00
from django.utils import timezone
2021-01-12 13:05:49 +01:00
from django.utils.timezone import is_aware
2015-12-20 19:23:33 +00:00
2020-12-30 21:48:34 +01:00
from django.utils.translation import gettext_lazy as _
2015-12-20 19:23:33 +00:00
from documents.parsers import get_default_file_extension
2018-09-09 21:03:37 +01:00
class MatchingModel(models.Model):
    """Abstract base for named models that carry a matching rule.

    Stores a unique ``name``, a free-text ``match`` expression, the
    algorithm selected for interpreting that expression and a flag telling
    whether matching is case-insensitive. The evaluation of the rule itself
    is not implemented in this class.
    """

    # Identifiers of the available matching algorithms.
    MATCH_ANY = 1
    MATCH_ALL = 2
    MATCH_LITERAL = 3
    MATCH_REGEX = 4
    MATCH_FUZZY = 5
    MATCH_AUTO = 6

    # (value, label) pairs offered as choices for ``matching_algorithm``.
    MATCHING_ALGORITHMS = (
        (MATCH_ANY, _("Any word")),
        (MATCH_ALL, _("All words")),
        (MATCH_LITERAL, _("Exact match")),
        (MATCH_REGEX, _("Regular expression")),
        (MATCH_FUZZY, _("Fuzzy word")),
        (MATCH_AUTO, _("Automatic")),
    )

    name = models.CharField(
        _("name"),
        max_length=128,
        unique=True,
    )

    match = models.CharField(
        _("match"),
        max_length=256,
        blank=True,
    )

    matching_algorithm = models.PositiveIntegerField(
        _("matching algorithm"),
        choices=MATCHING_ALGORITHMS,
        default=MATCH_ANY,
    )

    is_insensitive = models.BooleanField(
        _("is insensitive"),
        default=True,
    )

    class Meta:
        abstract = True
        ordering = ("name",)

    def __str__(self):
        """Return the object's name."""
        return self.name
class Correspondent(MatchingModel):
    """A correspondent that documents can be assigned to."""

    # This regex is probably more restrictive than it needs to be, but it's
    # better safe than sorry.
    SAFE_REGEX = re.compile(r"^[\w\- ,.']+$")

    class Meta:
        ordering = ("name",)
        verbose_name = _("correspondent")
        verbose_name_plural = _("correspondents")
class Tag(MatchingModel):
    """A tag that can be attached to documents."""

    # Stored as a string of at most 7 characters; the default is a
    # "#rrggbb" style value.
    color = models.CharField(
        _("color"),
        max_length=7,
        default="#a6cee3",
    )

    is_inbox_tag = models.BooleanField(
        _("is inbox tag"),
        default=False,
        help_text=_("Marks this tag as an inbox tag: All newly consumed "
                    "documents will be tagged with inbox tags."),
    )

    class Meta:
        verbose_name = _("tag")
        verbose_name_plural = _("tags")
2016-01-23 04:40:35 +00:00
2018-08-24 13:45:15 +02:00
class DocumentType(MatchingModel):
    """A document type that documents can be assigned to."""

    class Meta:
        verbose_name = _("document type")
        verbose_name_plural = _("document types")
2018-08-24 13:45:15 +02:00
2015-12-20 19:23:33 +00:00
class Document(models.Model):
    """A stored document together with its metadata and file locations.

    Besides the database fields, the class exposes properties that build
    the paths of the original file, the optional archive version and the
    thumbnail from the configured storage directories, and helpers that
    open those files for binary reading.
    """

    STORAGE_TYPE_UNENCRYPTED = "unencrypted"
    STORAGE_TYPE_GPG = "gpg"
    STORAGE_TYPES = (
        (STORAGE_TYPE_UNENCRYPTED, _("Unencrypted")),
        (STORAGE_TYPE_GPG, _("Encrypted with GNU Privacy Guard")),
    )

    correspondent = models.ForeignKey(
        Correspondent,
        blank=True,
        null=True,
        related_name="documents",
        on_delete=models.SET_NULL,
        verbose_name=_("correspondent"),
    )

    title = models.CharField(
        _("title"),
        max_length=128,
        blank=True,
        db_index=True,
    )

    document_type = models.ForeignKey(
        DocumentType,
        blank=True,
        null=True,
        related_name="documents",
        on_delete=models.SET_NULL,
        verbose_name=_("document type"),
    )

    content = models.TextField(
        _("content"),
        blank=True,
        help_text=_("The raw, text-only data of the document. This field is "
                    "primarily used for searching."),
    )

    mime_type = models.CharField(
        _("mime type"),
        max_length=256,
        editable=False,
    )

    tags = models.ManyToManyField(
        Tag,
        related_name="documents",
        blank=True,
        verbose_name=_("tags"),
    )

    checksum = models.CharField(
        _("checksum"),
        max_length=32,
        editable=False,
        unique=True,
        help_text=_("The checksum of the original document."),
    )

    archive_checksum = models.CharField(
        _("archive checksum"),
        max_length=32,
        editable=False,
        blank=True,
        null=True,
        help_text=_("The checksum of the archived document."),
    )

    created = models.DateTimeField(
        _("created"),
        default=timezone.now,
        db_index=True,
    )

    modified = models.DateTimeField(
        _("modified"),
        auto_now=True,
        editable=False,
        db_index=True,
    )

    storage_type = models.CharField(
        _("storage type"),
        max_length=11,
        choices=STORAGE_TYPES,
        default=STORAGE_TYPE_UNENCRYPTED,
        editable=False,
    )

    added = models.DateTimeField(
        _("added"),
        default=timezone.now,
        editable=False,
        db_index=True,
    )

    filename = models.FilePathField(
        _("filename"),
        max_length=1024,
        editable=False,
        default=None,
        unique=True,
        null=True,
        help_text=_("Current filename in storage"),
    )

    archive_filename = models.FilePathField(
        _("archive filename"),
        max_length=1024,
        editable=False,
        default=None,
        unique=True,
        null=True,
        help_text=_("Current archive filename in storage"),
    )

    archive_serial_number = models.IntegerField(
        _("archive serial number"),
        blank=True,
        null=True,
        unique=True,
        db_index=True,
        help_text=_("The position of this document in your physical document "
                    "archive."),
    )

    class Meta:
        ordering = ("-created",)
        verbose_name = _("document")
        verbose_name_plural = _("documents")

    def __str__(self):
        """Return ``"<date> <correspondent> <title>"`` or ``"<date> <title>"``.

        The date is the ISO formatted creation date; aware timestamps are
        converted to the local date first. The correspondent is only
        included when both a correspondent and a title are set.
        """
        if is_aware(self.created):
            created = timezone.localdate(self.created).isoformat()
        else:
            created = datetime.date.isoformat(self.created)

        if self.correspondent and self.title:
            return f"{created} {self.correspondent} {self.title}"
        else:
            return f"{created} {self.title}"

    @property
    def source_path(self):
        """Path of the original file below ``settings.ORIGINALS_DIR``.

        Uses the stored ``filename`` when one is set. Otherwise the name is
        derived from the zero-padded primary key and the file extension.
        """
        if self.filename:
            fname = str(self.filename)
        else:
            fname = "{:07}{}".format(self.pk, self.file_type)
            # NOTE(review): this check is assumed to belong to the fallback
            # branch only (a stored filename is used verbatim) - confirm.
            if self.storage_type == self.STORAGE_TYPE_GPG:
                fname += ".gpg"  # pragma: no cover

        return os.path.join(
            settings.ORIGINALS_DIR,
            fname
        )

    @property
    def source_file(self):
        """Open the original file for binary reading.

        The caller receives the open file object and is responsible for
        closing it.
        """
        return open(self.source_path, "rb")

    @property
    def has_archive_version(self):
        """Whether an archive filename is recorded for this document."""
        return self.archive_filename is not None

    @property
    def archive_path(self):
        """Path of the archive file below ``settings.ARCHIVE_DIR``.

        Returns ``None`` when the document has no archive version.
        """
        if self.has_archive_version:
            return os.path.join(
                settings.ARCHIVE_DIR,
                str(self.archive_filename)
            )
        else:
            return None

    @property
    def archive_file(self):
        """Open the archive file for binary reading.

        ``archive_path`` is ``None`` when there is no archive version, in
        which case ``open`` raises. The caller closes the returned file.
        """
        return open(self.archive_path, "rb")

    def get_public_filename(self, archive=False, counter=0, suffix=None):
        """Build a sanitized, human readable file name for this document.

        Starts from ``str(self)``, appends ``_<counter>`` (two digits) when
        ``counter`` is non-zero, then ``suffix`` when given, and finally
        ``.pdf`` for the archive version or the document's own file
        extension otherwise. The result is passed through
        ``pathvalidate.sanitize_filename`` with ``-`` as replacement text.
        """
        result = str(self)

        if counter:
            result += f"_{counter:02}"

        if suffix:
            result += suffix

        if archive:
            result += ".pdf"
        else:
            result += self.file_type

        return pathvalidate.sanitize_filename(result, replacement_text="-")

    @property
    def file_type(self):
        """File extension looked up from ``mime_type``."""
        return get_default_file_extension(self.mime_type)

    @property
    def thumbnail_path(self):
        """Path of the thumbnail below ``settings.THUMBNAIL_DIR``.

        The name is the zero-padded primary key with a ``.png`` extension,
        followed by ``.gpg`` for GPG storage.
        """
        file_name = "{:07}.png".format(self.pk)
        if self.storage_type == self.STORAGE_TYPE_GPG:
            file_name += ".gpg"

        return os.path.join(
            settings.THUMBNAIL_DIR,
            file_name
        )

    @property
    def thumbnail_file(self):
        """Open the thumbnail for binary reading; the caller closes it."""
        return open(self.thumbnail_path, "rb")
2016-02-27 20:18:50 +00:00
class Log(models.Model):
    """A log message stored in the database."""

    # (value, label) pairs built from the standard ``logging`` levels.
    LEVELS = (
        (logging.DEBUG, _("debug")),
        (logging.INFO, _("information")),
        (logging.WARNING, _("warning")),
        (logging.ERROR, _("error")),
        (logging.CRITICAL, _("critical")),
    )

    # Optional UUID shared by messages that belong together.
    group = models.UUIDField(
        _("group"),
        blank=True,
        null=True,
    )

    message = models.TextField(_("message"))

    level = models.PositiveIntegerField(
        _("level"),
        choices=LEVELS,
        default=logging.INFO,
    )

    created = models.DateTimeField(_("created"), auto_now_add=True)

    class Meta:
        ordering = ("-created",)
        verbose_name = _("log")
        verbose_name_plural = _("logs")

    def __str__(self):
        """Return the log message."""
        return self.message
2020-11-21 12:12:19 +01:00
2020-12-12 15:46:56 +01:00
class SavedView(models.Model):
    """A named view owned by a user, with display flags and sort settings."""

    class Meta:
        ordering = ("name",)
        verbose_name = _("saved view")
        verbose_name_plural = _("saved views")

    # Deleting the user deletes the user's saved views as well.
    user = models.ForeignKey(
        User,
        on_delete=models.CASCADE,
        verbose_name=_("user"),
    )

    name = models.CharField(
        _("name"),
        max_length=128,
    )

    show_on_dashboard = models.BooleanField(
        _("show on dashboard"),
    )

    show_in_sidebar = models.BooleanField(
        _("show in sidebar"),
    )

    sort_field = models.CharField(
        _("sort field"),
        max_length=128,
    )

    sort_reverse = models.BooleanField(
        _("sort reverse"),
        default=False,
    )
2020-12-12 15:46:56 +01:00
class SavedViewFilterRule(models.Model):
    """A single filter rule that belongs to a saved view."""

    # (value, label) pairs offered as choices for ``rule_type``. The numeric
    # values are stored in the database, so existing entries keep their
    # numbers.
    RULE_TYPES = [
        (0, _("title contains")),
        (1, _("content contains")),
        (2, _("ASN is")),
        (3, _("correspondent is")),
        (4, _("document type is")),
        (5, _("is in inbox")),
        (6, _("has tag")),
        (7, _("has any tag")),
        (8, _("created before")),
        (9, _("created after")),
        (10, _("created year is")),
        (11, _("created month is")),
        (12, _("created day is")),
        (13, _("added before")),
        (14, _("added after")),
        (15, _("modified before")),
        (16, _("modified after")),
        (17, _("does not have tag")),
        (18, _("does not have ASN")),
        (19, _("title or content contains")),
    ]

    # Deleting the saved view deletes its rules; the rules of a view are
    # reachable through ``filter_rules``.
    saved_view = models.ForeignKey(
        SavedView,
        on_delete=models.CASCADE,
        related_name="filter_rules",
        verbose_name=_("saved view"),
    )

    rule_type = models.PositiveIntegerField(
        _("rule type"),
        choices=RULE_TYPES,
    )

    value = models.CharField(
        _("value"),
        max_length=128,
        blank=True,
        null=True,
    )

    class Meta:
        verbose_name = _("filter rule")
        verbose_name_plural = _("filter rules")
2020-12-12 15:46:56 +01:00
# TODO: why is this in the models file?
2018-04-22 16:28:03 +01:00
class FileInfo:
    """Metadata extracted from a file name.

    ``from_filename`` is the intended entry point: it applies the configured
    file name transformations, strips the extension and matches the
    remainder against ``REGEXES`` in order.
    """

    # Patterns are tried in insertion order; the first one that matches wins.
    REGEXES = OrderedDict([
        ("created-title", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<title>.*)$",
            flags=re.IGNORECASE
        )),
        ("title", re.compile(
            r"(?P<title>.*)$",
            flags=re.IGNORECASE
        ))
    ])

    def __init__(self, created=None, correspondent=None, title=None, tags=(),
                 extension=None):
        """Store the given values unchanged on the instance."""
        self.created = created
        self.title = title
        self.extension = extension
        self.correspondent = correspondent
        self.tags = tags

    @classmethod
    def _get_created(cls, created):
        """Parse the ``created`` group of a matched file name.

        The last character of ``created`` (the trailing ``Z`` of the
        pattern) is dropped, the rest is right-padded with zeros to 14
        characters, ``Z`` is appended again and the result is handed to
        ``dateutil.parser.parse``. Returns ``None`` when parsing raises
        ``ValueError``.
        """
        try:
            return dateutil.parser.parse("{:0<14}Z".format(created[:-1]))
        except ValueError:
            return None

    @classmethod
    def _get_title(cls, title):
        """Return the title unchanged."""
        return title

    @classmethod
    def _mangle_property(cls, properties, name):
        """Replace ``properties[name]`` with the result of ``_get_<name>``.

        ``properties`` is modified in place; nothing happens when ``name``
        is not a key of the dictionary.
        """
        if name in properties:
            properties[name] = getattr(cls, "_get_{}".format(name))(
                properties[name]
            )

    @classmethod
    def from_filename(cls, filename):
        """Build a ``FileInfo`` from ``filename``.

        Returns ``None`` when none of the patterns in ``REGEXES`` matches
        the processed name.
        """
        # Rewrite the file name before parsing its components by applying
        # at most one of the configured transformations.
        for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS:
            (filename, count) = pattern.subn(repl, filename)
            if count:
                break

        # do this after the transforms so that the transforms can do whatever
        # with the file extension.
        filename_no_ext = os.path.splitext(filename)[0]

        if filename_no_ext == filename and filename.startswith("."):
            # This is a very special case where there is no text before the
            # file type.
            # TODO: this should be handled better. The ext is not removed
            #  because usually, files like '.pdf' are just hidden files
            #  with the name pdf, but in our case, it's more likely that
            #  there's just no name to begin with.
            filename = ""
            # This isn't too bad either, since we'll just not match anything
            # and return an empty title. TODO: actually, this is kinda bad.
        else:
            filename = filename_no_ext

        # Parse filename components.
        for regex in cls.REGEXES.values():
            m = regex.match(filename)
            if m:
                properties = m.groupdict()
                cls._mangle_property(properties, "created")
                cls._mangle_property(properties, "title")
                return cls(**properties)

        # No pattern matched.
        return None