paperless-ngx/src/documents/models.py

190 lines
5.6 KiB
Python
Raw Normal View History

2016-01-01 16:13:59 +00:00
import os
2016-01-28 07:23:11 +00:00
import re
2016-01-01 16:13:59 +00:00
from django.conf import settings
from django.core.urlresolvers import reverse
2015-12-20 19:23:33 +00:00
from django.db import models
2016-01-11 12:52:19 +00:00
from django.template.defaultfilters import slugify
2015-12-26 13:20:52 +00:00
from django.utils import timezone
2015-12-20 19:23:33 +00:00
2016-01-23 04:40:35 +00:00
class SluggedModel(models.Model):
    """
    Abstract base model for things identified by a unique name plus a slug.

    The slug may be left blank; it is then derived from the name the first
    time the instance is saved.
    """

    name = models.CharField(max_length=128, unique=True)
    slug = models.SlugField(blank=True)

    class Meta(object):
        abstract = True

    def __str__(self):
        return self.name

    def save(self, *args, **kwargs):
        """Fill in a missing slug from ``name``, then save as usual."""
        # An explicitly provided slug always wins; only an empty one is
        # replaced with the slugified name.
        self.slug = self.slug or slugify(self.name)
        models.Model.save(self, *args, **kwargs)
2016-01-23 04:40:35 +00:00
class Sender(SluggedModel):
    """A named sender that documents can be attributed to."""

    # Deliberately conservative whitelist for sender names: word characters,
    # hyphens, spaces, commas, full stops and apostrophes only.  It is
    # probably stricter than necessary, but better safe than sorry.
    SAFE_REGEX = re.compile(r"^[\w\- ,.']+$")

    class Meta(object):
        # Senders are listed alphabetically by name.
        ordering = ("name",)
class Tag(SluggedModel):
    """
    A coloured label that can be attached to documents, together with an
    optional rule (``match`` + ``matching_algorithm``) used to decide whether
    the tag applies to a piece of text.
    """

    COLOURS = (
        (1, "#a6cee3"),
        (2, "#1f78b4"),
        (3, "#b2df8a"),
        (4, "#33a02c"),
        (5, "#fb9a99"),
        (6, "#e31a1c"),
        (7, "#fdbf6f"),
        (8, "#ff7f00"),
        (9, "#cab2d6"),
        (10, "#6a3d9a"),
        (11, "#b15928"),
        (12, "#000000"),
        (13, "#cccccc")
    )

    MATCH_ANY = 1
    MATCH_ALL = 2
    MATCH_LITERAL = 3
    MATCH_REGEX = 4
    MATCHING_ALGORITHMS = (
        (MATCH_ANY, "Any"),
        (MATCH_ALL, "All"),
        (MATCH_LITERAL, "Literal"),
        (MATCH_REGEX, "Regular Expression"),
    )

    colour = models.PositiveIntegerField(choices=COLOURS, default=1)

    match = models.CharField(max_length=256, blank=True)
    matching_algorithm = models.PositiveIntegerField(
        choices=MATCHING_ALGORITHMS,
        default=MATCH_ANY,
        help_text=(
            "Which algorithm you want to use when matching text to the OCR'd "
            "PDF. Here, \"any\" looks for any occurrence of any word "
            "provided in the PDF, while \"all\" requires that every word "
            "provided appear in the PDF, albeit not in the order provided. A "
            "\"literal\" match means that the text you enter must appear in "
            "the PDF exactly as you've entered it, and \"regular expression\" "
            "uses a regex to match the PDF. If you don't know what a regex "
            "is, you probably don't want this option."
        )
    )

    @property
    def conditions(self):
        """Human-readable summary: name, match text and algorithm label."""
        return "{}: \"{}\" ({})".format(
            self.name, self.match, self.get_matching_algorithm_display())

    @classmethod
    def match_all(cls, text, tags=None):
        """
        Yield every tag whose rule matches ``text``.

        :param text: The text to test; it is lowercased before matching.
        :param tags: Optional iterable of tags to test.  Defaults to every
                     tag in the database.
        """
        if tags is None:
            tags = cls.objects.all()

        text = text.lower()
        for tag in tags:
            if tag.matches(text):
                yield tag

    def matches(self, text):
        """
        Return True if this tag's rule matches ``text``.

        Matching is case-insensitive for every algorithm.  For "any", "all"
        and "literal" the match text is treated as plain text (regex
        metacharacters are escaped) and must sit on word boundaries.  For
        "regular expression" the match text is used as the pattern as-is, so
        an invalid pattern raises ``re.error``.

        :raises NotImplementedError: if ``matching_algorithm`` is not one of
                                     the ``MATCH_*`` constants.
        """

        # An empty (or whitespace-only) rule never matches anything.
        if not self.match.strip():
            return False

        if self.matching_algorithm == self.MATCH_REGEX:
            return bool(re.search(self.match, text, re.IGNORECASE))

        def found(term):
            # Escape the term so characters such as "+", "(" or "." are
            # matched literally instead of being interpreted as regex syntax.
            pattern = r"\b{}\b".format(re.escape(term))
            return bool(re.search(pattern, text, re.IGNORECASE))

        # split() without an argument collapses runs of whitespace, so stray
        # double spaces cannot produce empty terms (an empty term would build
        # the pattern "\b\b", which matches at any word boundary).
        if self.matching_algorithm == self.MATCH_ALL:
            return all(found(word) for word in self.match.split())

        if self.matching_algorithm == self.MATCH_ANY:
            return any(found(word) for word in self.match.split())

        if self.matching_algorithm == self.MATCH_LITERAL:
            return found(self.match)

        raise NotImplementedError("Unsupported matching algorithm")

    def save(self, *args, **kwargs):
        """
        Normalise the match text, then save.

        Plain-text rules are stored lowercased.  Regular expressions are
        stored untouched, because lowercasing a pattern changes its meaning
        (for example "\\W" would become "\\w"); case-insensitivity for them
        is handled at match time instead.
        """
        if self.matching_algorithm != self.MATCH_REGEX:
            self.match = self.match.lower()
        SluggedModel.save(self, *args, **kwargs)
2016-01-23 04:40:35 +00:00
2015-12-20 19:23:33 +00:00
class Document(models.Model):
    """
    A stored document: its extracted text content, an optional sender and
    title, a set of tags, and the type of the underlying source file.
    """

    TYPE_PDF = "pdf"
    TYPE_PNG = "png"
    TYPE_JPG = "jpg"
    TYPE_GIF = "gif"
    TYPE_TIF = "tiff"
    TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,)

    sender = models.ForeignKey(
        Sender, blank=True, null=True, related_name="documents")
    title = models.CharField(max_length=128, blank=True, db_index=True)
    content = models.TextField(db_index=True)
    file_type = models.CharField(
        max_length=4,
        editable=False,
        # Each choice is (stored value, upper-cased label), e.g. ("pdf", "PDF").
        choices=tuple([(t, t.upper()) for t in TYPES])
    )
    tags = models.ManyToManyField(
        Tag, related_name="documents", blank=True)
    created = models.DateTimeField(default=timezone.now, editable=False)
    modified = models.DateTimeField(auto_now=True, editable=False)

    class Meta(object):
        ordering = ("sender", "title")

    def __str__(self):
        """
        Render as the creation date, followed by whichever of sender and
        title are set.
        """
        date = self.created.strftime("%Y-%m-%d")
        sender, title = self.sender, self.title

        # Neither piece of metadata available: the date is all we have.
        if not (sender or title):
            return str(date)

        if sender and title:
            return "{}: {}, {}".format(date, sender, title)

        # Exactly one of the two is set; show that one.
        return "{}: {}".format(date, sender or title)

    @property
    def source_path(self):
        """
        Path of the stored source file:
        ``<MEDIA_ROOT>/documents/<zero-padded pk>.<file_type>.gpg``.
        """
        stored_name = "{:07}.{}.gpg".format(self.pk, self.file_type)
        return os.path.join(settings.MEDIA_ROOT, "documents", stored_name)

    @property
    def source_file(self):
        """
        A freshly opened binary read handle on ``source_path``.  The caller
        is responsible for closing it.
        """
        return open(self.source_path, "rb")

    @property
    def file_name(self):
        """
        A descriptive file name built from sender, title and tag slugs, or
        the base name of ``source_path`` when sender or title is missing.
        """
        if not (self.sender and self.title):
            return os.path.basename(self.source_path)

        slugs = ",".join(tag.slug for tag in self.tags.all())
        if not slugs:
            return "{} - {}.{}".format(
                self.sender, self.title, self.file_type)

        return "{} - {} - {}.{}".format(
            self.sender, self.title, slugs, self.file_type)

    @property
    def download_url(self):
        """URL of the "fetch" view for this document."""
        return reverse("fetch", kwargs=dict(pk=self.pk))