paperless-ngx/src/documents/models.py

import os
import re

from django.conf import settings
from django.core.urlresolvers import reverse
from django.db import models
from django.template.defaultfilters import slugify
from django.utils import timezone


class SluggedModel(models.Model):
    """
    Abstract base for models that carry a unique ``name`` and a ``slug``
    derived from it.
    """

    name = models.CharField(max_length=128, unique=True)
    slug = models.SlugField(blank=True)

    class Meta(object):
        abstract = True

    def save(self, *args, **kwargs):
        """
        Fill in ``slug`` from ``name`` when it is empty, then save as usual.

        An explicitly provided slug is never overwritten.  All positional and
        keyword arguments are forwarded to the parent ``save()``.
        """
        if not self.slug:
            # ``name`` may be up to 128 characters long, which is more than
            # the slug column accepts, so clip the generated slug to the
            # field's own limit instead of handing the database a value that
            # does not fit.
            limit = self._meta.get_field("slug").max_length
            self.slug = slugify(self.name)[:limit]
        # Go through super() rather than naming the base class so that the
        # method resolution order is respected.
        super(SluggedModel, self).save(*args, **kwargs)

    def __str__(self):
        return self.name


class Sender(SluggedModel):
    """
    A named sender that documents can point at through ``Document.sender``.

    ``name`` and ``slug`` are inherited from ``SluggedModel``.
    """

    # This regex is probably more restrictive than it needs to be, but it's
    # better safe than sorry.
    #
    # It accepts a non-empty string made up only of word characters, hyphens,
    # spaces, commas, full stops and apostrophes.  It is not referenced
    # anywhere else in this module, so its consumers live elsewhere.
    SAFE_REGEX = re.compile(r"^[\w\- ,.']+$")

    class Meta(object):
        # Default queryset ordering: alphabetical by name.
        ordering = ("name",)


class Tag(SluggedModel):
    """
    A coloured label for documents, optionally with a matching rule.

    The rule is the pair (``match``, ``matching_algorithm``); ``matches()``
    evaluates it against a piece of text.
    """

    COLOURS = (
        (1, "#a6cee3"),
        (2, "#1f78b4"),
        (3, "#b2df8a"),
        (4, "#33a02c"),
        (5, "#fb9a99"),
        (6, "#e31a1c"),
        (7, "#fdbf6f"),
        (8, "#ff7f00"),
        (9, "#cab2d6"),
        (10, "#6a3d9a"),
        (11, "#b15928"),
        (12, "#000000"),
        (13, "#cccccc")
    )

    MATCH_ANY = 1
    MATCH_ALL = 2
    MATCH_LITERAL = 3
    MATCH_REGEX = 4
    MATCHING_ALGORITHMS = (
        (MATCH_ANY, "Any"),
        (MATCH_ALL, "All"),
        (MATCH_LITERAL, "Literal"),
        (MATCH_REGEX, "Regular Expression"),
    )

    colour = models.PositiveIntegerField(choices=COLOURS, default=1)
    match = models.CharField(max_length=256, blank=True)
    matching_algorithm = models.PositiveIntegerField(
        choices=MATCHING_ALGORITHMS,
        default=MATCH_ANY,
        help_text=(
            "Which algorithm you want to use when matching text to the OCR'd "
            "PDF.  Here, \"any\" looks for any occurrence of any word "
            "provided in the PDF, while \"all\" requires that every word "
            "provided appear in the PDF, albeit not in the order provided.  A "
            "\"literal\" match means that the text you enter must appear in "
            "the PDF exactly as you've entered it, and \"regular expression\" "
            "uses a regex to match the PDF.  If you don't know what a regex "
            "is, you probably don't want this option."
        )
    )

    @property
    def conditions(self):
        """Human-readable summary: name, match text and algorithm label."""
        return "{}: \"{}\" ({})".format(
            self.name, self.match, self.get_matching_algorithm_display())

    @classmethod
    def match_all(cls, text, tags=None):
        """
        Yield every tag whose rule matches ``text``.

        ``text`` is lower-cased before matching.  ``tags`` is the iterable of
        candidates to test; when it is ``None`` every stored tag is tested.
        """

        if tags is None:
            tags = cls.objects.all()

        text = text.lower()
        for tag in tags:
            if tag.matches(text):
                yield tag

    @staticmethod
    def _contains_word(needle, text):
        """Return True if ``needle`` occurs in ``text`` between word
        boundaries, treating ``needle`` as plain text, not as a regex."""
        # re.escape() keeps characters such as "+", "(" or "." in user input
        # from being interpreted as regex syntax (or from raising re.error).
        return bool(re.search(r"\b{}\b".format(re.escape(needle)), text))

    def matches(self, text):
        """
        Return True if this tag's rule matches ``text``.

        An empty (or whitespace-only) ``match`` never matches.  Raises
        ``NotImplementedError`` for an unknown ``matching_algorithm`` and
        ``re.error`` if a regular-expression rule does not compile.
        """

        # Check that match is not empty
        if self.match.strip() == "":
            return False

        if self.matching_algorithm == self.MATCH_ALL:
            # split() without an argument drops the empty "words" produced by
            # repeated spaces.
            return all(
                self._contains_word(word, text)
                for word in self.match.split()
            )

        if self.matching_algorithm == self.MATCH_ANY:
            # An empty word would match at any word boundary and make the tag
            # apply to practically every text, hence split() and not
            # split(" ").
            return any(
                self._contains_word(word, text)
                for word in self.match.split()
            )

        if self.matching_algorithm == self.MATCH_LITERAL:
            return self._contains_word(self.match, text)

        if self.matching_algorithm == self.MATCH_REGEX:
            # The pattern is stored as entered (see save()), so ignore case
            # here rather than relying on a lower-cased pattern.
            return bool(re.search(self.match, text, re.IGNORECASE))

        raise NotImplementedError("Unsupported matching algorithm")

    def save(self, *args, **kwargs):
        """
        Normalise ``match`` and save.

        Plain-text rules are lower-cased because ``match_all()`` lower-cases
        the text they are compared with.  Regular expressions are left alone:
        lower-casing them would turn ``\\W``, ``\\S``, ``\\D`` and ``\\B``
        into their opposites.
        """
        if self.matching_algorithm != self.MATCH_REGEX:
            self.match = self.match.lower()
        super(Tag, self).save(*args, **kwargs)


class Document(models.Model):
    """
    A stored file of one of the supported ``TYPES`` together with its text
    ``content`` and an optional sender, title and set of tags.
    """

    TYPE_PDF = "pdf"
    TYPE_PNG = "png"
    TYPE_JPG = "jpg"
    TYPE_GIF = "gif"
    TYPE_TIF = "tiff"
    TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,)

    sender = models.ForeignKey(
        Sender,
        blank=True,
        null=True,
        related_name="documents",
    )
    title = models.CharField(max_length=128, blank=True, db_index=True)
    content = models.TextField(db_index=True)
    file_type = models.CharField(
        max_length=4,
        editable=False,
        # (value, label) pairs, e.g. ("pdf", "PDF")
        choices=tuple((kind, kind.upper()) for kind in TYPES),
    )
    tags = models.ManyToManyField(
        Tag,
        related_name="documents",
        blank=True,
    )
    created = models.DateTimeField(default=timezone.now, editable=False)
    modified = models.DateTimeField(auto_now=True, editable=False)

    class Meta(object):
        ordering = ("sender", "title")

    def __str__(self):
        """
        Render as the creation date, followed by whichever of sender and
        title are set.
        """
        day = self.created.strftime("%Y-%m-%d")
        sender, title = self.sender, self.title

        if sender and title:
            return "{}: {}, {}".format(day, sender, title)

        label = sender or title
        if label:
            return "{}: {}".format(day, label)

        return str(day)

    @property
    def source_path(self):
        """Path of the stored ``.gpg`` file below ``MEDIA_ROOT/documents``."""
        stored_name = "{:07}.{}.gpg".format(self.pk, self.file_type)
        return os.path.join(settings.MEDIA_ROOT, "documents", stored_name)

    @property
    def source_file(self):
        """
        A freshly opened binary read handle on ``source_path``.

        Every access opens the file again; closing it is up to the caller.
        """
        path = self.source_path
        return open(path, "rb")

    @property
    def file_name(self):
        """
        A descriptive file name built from sender, title and tag slugs.

        Falls back to the base name of ``source_path`` unless both a sender
        and a title are set.
        """
        if not (self.sender and self.title):
            return os.path.basename(self.source_path)

        slugs = ",".join(tag.slug for tag in self.tags.all())
        if not slugs:
            return "{} - {}.{}".format(self.sender, self.title, self.file_type)

        return "{} - {} - {}.{}".format(
            self.sender, self.title, slugs, self.file_type)

    @property
    def download_url(self):
        """URL of the ``fetch`` route for this document's primary key."""
        url_kwargs = {"pk": self.pk}
        return reverse("fetch", kwargs=url_kwargs)
Added GPG encryption for the PDFs 2016-01-01 16:13:59 +00:00			`import os`
#11: automatic tagging support 2016-01-28 07:23:11 +00:00			`import re`
Added GPG encryption for the PDFs 2016-01-01 16:13:59 +00:00
			`from django.conf import settings`
Added download_url to the Document model 2016-02-15 22:38:18 +00:00			`from django.core.urlresolvers import reverse`
It works! 2015-12-20 19:23:33 +00:00			`from django.db import models`
Created a Sender model 2016-01-11 12:52:19 +00:00			`from django.template.defaultfilters import slugify`
Better created & modified + __str__() 2015-12-26 13:20:52 +00:00			`from django.utils import timezone`
It works! 2015-12-20 19:23:33 +00:00

Add labels (#9) 2016-01-23 04:40:35 +00:00			`class SluggedModel(models.Model):`
Created a Sender model 2016-01-11 12:52:19 +00:00
			`name = models.CharField(max_length=128, unique=True)`
Add labels (#9) 2016-01-23 04:40:35 +00:00			`slug = models.SlugField(blank=True)`
Created a Sender model 2016-01-11 12:52:19 +00:00
Sort senders by name 2016-01-17 02:09:52 +00:00			`class Meta(object):`
Add labels (#9) 2016-01-23 04:40:35 +00:00			`abstract = True`
Sort senders by name 2016-01-17 02:09:52 +00:00
Created a Sender model 2016-01-11 12:52:19 +00:00			`def save(self, args, *kwargs):`
			`if not self.slug:`
			`self.slug = slugify(self.name)`
			`models.Model.save(self, args, *kwargs)`

			`def __str__(self):`
			`return self.name`


Add labels (#9) 2016-01-23 04:40:35 +00:00			`class Sender(SluggedModel):`

The 'API' is written but untested 2016-02-08 23:46:16 +00:00			`# This regex is probably more restrictive than it needs to be, but it's`
			`# better safe than sorry.`
			`SAFE_REGEX = re.compile(r"^[\w\- ,.']+$")`

Add labels (#9) 2016-01-23 04:40:35 +00:00			`class Meta(object):`
			`ordering = ("name",)`


			`class Tag(SluggedModel):`
pep8 2016-02-21 00:14:50 +00:00
Add labels (#9) 2016-01-23 04:40:35 +00:00			`COLOURS = (`
			`(1, "#a6cee3"),`
			`(2, "#1f78b4"),`
			`(3, "#b2df8a"),`
			`(4, "#33a02c"),`
			`(5, "#fb9a99"),`
			`(6, "#e31a1c"),`
			`(7, "#fdbf6f"),`
			`(8, "#ff7f00"),`
			`(9, "#cab2d6"),`
			`(10, "#6a3d9a"),`
			`(11, "#b15928"),`
			`(12, "#000000"),`
			`(13, "#cccccc")`
			`)`
#11: automatic tagging support 2016-01-28 07:23:11 +00:00
			`MATCH_ANY = 1`
			`MATCH_ALL = 2`
			`MATCH_LITERAL = 3`
			`MATCH_REGEX = 4`
			`MATCHING_ALGORITHMS = (`
			`(MATCH_ANY, "Any"),`
			`(MATCH_ALL, "All"),`
			`(MATCH_LITERAL, "Literal"),`
			`(MATCH_REGEX, "Regular Expression"),`
			`)`

Add labels (#9) 2016-01-23 04:40:35 +00:00			`colour = models.PositiveIntegerField(choices=COLOURS, default=1)`
#11: automatic tagging support 2016-01-28 07:23:11 +00:00			`match = models.CharField(max_length=256, blank=True)`
			`matching_algorithm = models.PositiveIntegerField(`
			`choices=MATCHING_ALGORITHMS,`
Added a default algorithm 2016-02-14 01:30:41 +00:00			`default=MATCH_ANY,`
#11: automatic tagging support 2016-01-28 07:23:11 +00:00			`help_text=(`
			`"Which algorithm you want to use when matching text to the OCR'd "`
pep8 2016-02-21 00:14:50 +00:00			`"PDF. Here, \"any\" looks for any occurrence of any word "`
			`"provided in the PDF, while \"all\" requires that every word "`
			`"provided appear in the PDF, albeit not in the order provided. A "`
#11: automatic tagging support 2016-01-28 07:23:11 +00:00			`"\"literal\" match means that the text you enter must appear in "`
			`"the PDF exactly as you've entered it, and \"regular expression\" "`
The 'API' is written but untested 2016-02-08 23:46:16 +00:00			`"uses a regex to match the PDF. If you don't know what a regex "`
#11: automatic tagging support 2016-01-28 07:23:11 +00:00			`"is, you probably don't want this option."`
			`)`
			`)`

			`@property`
			`def conditions(self):`
			`return "{}: \"{}\" ({})".format(`
			`self.name, self.match, self.get_matching_algorithm_display())`

Fixes #45 2016-02-17 23:07:54 +00:00			`@classmethod`
			`def match_all(cls, text, tags=None):`

			`if tags is None:`
			`tags = cls.objects.all()`

			`text = text.lower()`
			`for tag in tags:`
			`if tag.matches(text):`
			`yield tag`

#11: automatic tagging support 2016-01-28 07:23:11 +00:00			`def matches(self, text):`
Fixes #45 2016-02-17 23:07:54 +00:00
Fix matching if user supplied an empty value 2016-02-14 19:47:05 +01:00			`# Check that match is not empty`
			`if self.match.strip() == "":`
			`return False`
#11: automatic tagging support 2016-01-28 07:23:11 +00:00
			`if self.matching_algorithm == self.MATCH_ALL:`
			`for word in self.match.split(" "):`
#48: make the tag matching smarter 2016-02-19 00:45:02 +00:00			`if not re.search(r"\b{}\b".format(word), text):`
#11: automatic tagging support 2016-01-28 07:23:11 +00:00			`return False`
			`return True`

			`if self.matching_algorithm == self.MATCH_ANY:`
			`for word in self.match.split(" "):`
#48: make the tag matching smarter 2016-02-19 00:45:02 +00:00			`if re.search(r"\b{}\b".format(word), text):`
#11: automatic tagging support 2016-01-28 07:23:11 +00:00			`return True`
			`return False`

			`if self.matching_algorithm == self.MATCH_LITERAL:`
#48: make the tag matching smarter 2016-02-19 00:45:02 +00:00			`return bool(re.search(r"\b{}\b".format(self.match), text))`
#11: automatic tagging support 2016-01-28 07:23:11 +00:00
			`if self.matching_algorithm == self.MATCH_REGEX:`
#48: make the tag matching smarter 2016-02-19 00:45:02 +00:00			`return bool(re.search(re.compile(self.match), text))`
#11: automatic tagging support 2016-01-28 07:23:11 +00:00
			`raise NotImplementedError("Unsupported matching algorithm")`

			`def save(self, args, *kwargs):`
			`self.match = self.match.lower()`
			`SluggedModel.save(self, args, *kwargs)`
Add labels (#9) 2016-01-23 04:40:35 +00:00

It works! 2015-12-20 19:23:33 +00:00			`class Document(models.Model):`

#12: Support image documents 2016-01-29 23:18:03 +00:00			`TYPE_PDF = "pdf"`
			`TYPE_PNG = "png"`
			`TYPE_JPG = "jpg"`
			`TYPE_GIF = "gif"`
			`TYPE_TIF = "tiff"`
			`TYPES = (TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,)`

Fixed a few consumer bugs and added an exporter Rename exporter to export and fixt some debugging Account for files not matching the sender/title pattern Added a safety note Wrong regex on the name parser Renamed the command to something slightly less ambiguous 2016-01-14 19:47:57 +00:00			`sender = models.ForeignKey(`
			`Sender, blank=True, null=True, related_name="documents")`
It works! 2015-12-20 19:23:33 +00:00			`title = models.CharField(max_length=128, blank=True, db_index=True)`
			`content = models.TextField(db_index=True)`
#12: Support image documents 2016-01-29 23:18:03 +00:00			`file_type = models.CharField(`
			`max_length=4,`
			`editable=False,`
			`choices=tuple([(t, t.upper()) for t in TYPES])`
			`)`
The 'API' is written but untested 2016-02-08 23:46:16 +00:00			`tags = models.ManyToManyField(`
			`Tag, related_name="documents", blank=True)`
Better created & modified + __str__() 2015-12-26 13:20:52 +00:00			`created = models.DateTimeField(default=timezone.now, editable=False)`
			`modified = models.DateTimeField(auto_now=True, editable=False)`

			`class Meta(object):`
			`ordering = ("sender", "title")`

			`def __str__(self):`
			`created = self.created.strftime("%Y-%m-%d")`
			`if self.sender and self.title:`
			`return "{}: {}, {}".format(created, self.sender, self.title)`
			`if self.sender or self.title:`
Fixed a few consumer bugs and added an exporter Rename exporter to export and fixt some debugging Account for files not matching the sender/title pattern Added a safety note Wrong regex on the name parser Renamed the command to something slightly less ambiguous 2016-01-14 19:47:57 +00:00			`return "{}: {}".format(created, self.sender or self.title)`
Better created & modified + __str__() 2015-12-26 13:20:52 +00:00			`return str(created)`
Added GPG encryption for the PDFs 2016-01-01 16:13:59 +00:00
			`@property`
#12: Support image documents 2016-01-29 23:18:03 +00:00			`def source_path(self):`
Added GPG encryption for the PDFs 2016-01-01 16:13:59 +00:00			`return os.path.join(`
			`settings.MEDIA_ROOT,`
			`"documents",`
#12: Support image documents 2016-01-29 23:18:03 +00:00			`"{:07}.{}.gpg".format(self.pk, self.file_type)`
Added GPG encryption for the PDFs 2016-01-01 16:13:59 +00:00			`)`

			`@property`
#12: Support image documents 2016-01-29 23:18:03 +00:00			`def source_file(self):`
			`return open(self.source_path, "rb")`
Fixed a few consumer bugs and added an exporter Rename exporter to export and fixt some debugging Account for files not matching the sender/title pattern Added a safety note Wrong regex on the name parser Renamed the command to something slightly less ambiguous 2016-01-14 19:47:57 +00:00
			`@property`
Added download_url to the Document model 2016-02-15 22:38:18 +00:00			`def file_name(self):`
Fixed a few consumer bugs and added an exporter Rename exporter to export and fixt some debugging Account for files not matching the sender/title pattern Added a safety note Wrong regex on the name parser Renamed the command to something slightly less ambiguous 2016-01-14 19:47:57 +00:00			`if self.sender and self.title:`
Added pytest and broke up the consumer into file and mail 2016-02-05 00:23:36 +00:00			`tags = ",".join([t.slug for t in self.tags.all()])`
			`if tags:`
			`return "{} - {} - {}.{}".format(`
			`self.sender, self.title, tags, self.file_type)`
			`return "{} - {}.{}".format(self.sender, self.title, self.file_type)`
#12: Support image documents 2016-01-29 23:18:03 +00:00			`return os.path.basename(self.source_path)`
Added download_url to the Document model 2016-02-15 22:38:18 +00:00
			`@property`
			`def download_url(self):`
			`return reverse("fetch", kwargs={"pk": self.pk})`