2018-01-06 17:23:07 +00:00
# coding=utf-8
2016-02-27 20:18:50 +00:00
import logging
2016-01-01 16:13:59 +00:00
import os
2016-01-28 07:23:11 +00:00
import re
2016-03-06 17:26:07 +00:00
import uuid
2016-03-24 19:18:33 +00:00
from collections import OrderedDict
2018-09-09 21:03:37 +01:00
import dateutil . parser
2016-01-01 16:13:59 +00:00
from django . conf import settings
2015-12-20 19:23:33 +00:00
from django . db import models
2016-01-11 12:52:19 +00:00
from django . template . defaultfilters import slugify
2015-12-26 13:20:52 +00:00
from django . utils import timezone
2018-09-09 21:03:37 +01:00
from fuzzywuzzy import fuzz
2015-12-20 19:23:33 +00:00
2016-02-28 00:41:03 +00:00
from . managers import LogManager
2018-09-09 21:03:37 +01:00
try :
from django . core . urlresolvers import reverse
except ImportError :
from django . urls import reverse
2015-12-20 19:23:33 +00:00
2016-03-28 11:11:15 +01:00
class MatchingModel(models.Model):
    """
    Abstract base model for named objects.

    Supplies a unique ``name``, a ``slug`` that is derived from the name
    when left empty, and the ``automatic_classification`` flag.
    """

    name = models.CharField(max_length=128, unique=True)
    slug = models.SlugField(blank=True)

    automatic_classification = models.BooleanField(
        default=False,
        help_text="Automatically assign to newly added documents based on "
                  "current usage in your document collection."
    )

    class Meta:
        abstract = True
        ordering = ("name",)

    def __str__(self):
        return self.name

    def save(self, *args, **kwargs):
        """
        Persist the instance, deriving ``slug`` from ``name`` if it is empty.

        All positional and keyword arguments are forwarded unchanged to the
        parent ``save()``.
        """
        if not self.slug:
            self.slug = slugify(self.name)
        # Use cooperative super() rather than naming models.Model directly,
        # so any other class in a subclass's MRO still gets its save() run.
        super().save(*args, **kwargs)
class Correspondent(MatchingModel):
    """Named correspondent; listed alphabetically by ``name``."""

    # Deliberately strict whitelist of characters (word characters, hyphen,
    # space, comma, full stop, apostrophe): erring on the side of caution
    # is preferred over being permissive here.
    SAFE_REGEX = re.compile(r"^[\w\- ,.']+$")

    class Meta:
        ordering = ("name",)
class Tag(MatchingModel):
    """Coloured label with optional "inbox" and "archive" semantics."""

    # Choices for ``colour``: (1-based index, hex colour) pairs, in this
    # exact order.
    COLOURS = tuple(enumerate(
        (
            "#a6cee3",
            "#1f78b4",
            "#b2df8a",
            "#33a02c",
            "#fb9a99",
            "#e31a1c",
            "#fdbf6f",
            "#ff7f00",
            "#cab2d6",
            "#6a3d9a",
            "#b15928",
            "#000000",
            "#cccccc",
        ),
        start=1
    ))

    colour = models.PositiveIntegerField(choices=COLOURS, default=1)

    is_inbox_tag = models.BooleanField(
        default=False,
        help_text=(
            "Marks this tag as an inbox tag: All newly consumed documents "
            "will be tagged with inbox tags."
        )
    )

    is_archived_tag = models.BooleanField(
        default=False,
        help_text=(
            "Marks this tag as an archive tag: All documents tagged with "
            "archive tags will never be modified automatically (i.e., "
            "modifying tags by matching rules)"
        )
    )
2016-01-23 04:40:35 +00:00
2018-08-24 13:45:15 +02:00
class DocumentType(MatchingModel):
    """Named document type; adds nothing beyond what MatchingModel defines."""
2015-12-20 19:23:33 +00:00
class Document(models.Model):
    """
    A stored document: searchable text, metadata, and helpers that locate
    the original file and its thumbnail below ``settings.MEDIA_ROOT``.
    """

    # Values accepted by ``file_type``.
    TYPE_PDF = "pdf"
    TYPE_PNG = "png"
    TYPE_JPG = "jpg"
    TYPE_GIF = "gif"
    TYPE_TIF = "tiff"
    TYPE_TXT = "txt"
    TYPE_CSV = "csv"
    TYPE_MD = "md"
    TYPES = (
        TYPE_PDF, TYPE_PNG, TYPE_JPG, TYPE_GIF, TYPE_TIF,
        TYPE_TXT, TYPE_CSV, TYPE_MD,
    )

    # Values accepted by ``storage_type``.
    STORAGE_TYPE_UNENCRYPTED = "unencrypted"
    STORAGE_TYPE_GPG = "gpg"
    STORAGE_TYPES = (
        (STORAGE_TYPE_UNENCRYPTED, "Unencrypted"),
        (STORAGE_TYPE_GPG, "Encrypted with GNU Privacy Guard"),
    )

    correspondent = models.ForeignKey(
        Correspondent,
        blank=True,
        null=True,
        related_name="documents",
        on_delete=models.SET_NULL
    )

    title = models.CharField(max_length=128, blank=True, db_index=True)

    document_type = models.ForeignKey(
        DocumentType,
        blank=True,
        null=True,
        related_name="documents",
        on_delete=models.SET_NULL
    )

    content = models.TextField(
        db_index=True,
        blank=True,
        help_text=(
            "The raw, text-only data of the document. This field is "
            "primarily used for searching."
        )
    )

    file_type = models.CharField(
        max_length=4,
        editable=False,
        # Each choice is labelled with the upper-cased type, e.g. "PDF".
        choices=tuple((kind, kind.upper()) for kind in TYPES)
    )

    tags = models.ManyToManyField(Tag, related_name="documents", blank=True)

    checksum = models.CharField(
        max_length=32,
        editable=False,
        unique=True,
        help_text=(
            "The checksum of the original document (before it was "
            "encrypted). We use this to prevent duplicate document "
            "imports."
        )
    )

    created = models.DateTimeField(default=timezone.now, db_index=True)

    modified = models.DateTimeField(
        auto_now=True,
        editable=False,
        db_index=True
    )

    storage_type = models.CharField(
        max_length=11,
        choices=STORAGE_TYPES,
        default=STORAGE_TYPE_UNENCRYPTED,
        editable=False
    )

    added = models.DateTimeField(
        default=timezone.now,
        editable=False,
        db_index=True
    )

    archive_serial_number = models.IntegerField(
        blank=True,
        null=True,
        unique=True,
        db_index=True,
        help_text=(
            "The position of this document in your physical document "
            "archive."
        )
    )

    class Meta:
        ordering = ("correspondent", "title")

    def __str__(self):
        """
        Return the creation timestamp, followed by whichever of the
        correspondent and title are set.
        """
        stamp = self.created.strftime("%Y%m%d%H%M%S")
        sender, heading = self.correspondent, self.title

        # Neither label available: the timestamp alone identifies the row.
        if not (sender or heading):
            return str(stamp)

        if sender and heading:
            return "{}: {} - {}".format(stamp, sender, heading)

        return "{}: {}".format(stamp, sender or heading)

    @property
    def source_path(self):
        """Path of the original file under MEDIA_ROOT/documents/originals."""
        is_gpg = self.storage_type == self.STORAGE_TYPE_GPG
        stored_name = "{:07}.{}".format(self.pk, self.file_type)
        if is_gpg:
            # GPG-stored files carry an extra ".gpg" suffix.
            stored_name = stored_name + ".gpg"
        return os.path.join(
            settings.MEDIA_ROOT, "documents", "originals", stored_name)

    @property
    def source_file(self):
        """Open the original file for binary reading; the caller closes it."""
        return open(self.source_path, "rb")

    @property
    def file_name(self):
        """Slugified string form of the document plus its type extension."""
        return "{}.{}".format(slugify(str(self)), self.file_type)

    @property
    def download_url(self):
        """URL of the "fetch" view for the document itself."""
        return reverse("fetch", kwargs={"kind": "doc", "pk": self.pk})

    @property
    def thumbnail_path(self):
        """Path of the thumbnail under MEDIA_ROOT/documents/thumbnails."""
        is_gpg = self.storage_type == self.STORAGE_TYPE_GPG
        stored_name = "{:07}.png".format(self.pk)
        if is_gpg:
            # GPG-stored thumbnails carry an extra ".gpg" suffix.
            stored_name = stored_name + ".gpg"
        return os.path.join(
            settings.MEDIA_ROOT, "documents", "thumbnails", stored_name)

    @property
    def thumbnail_file(self):
        """Open the thumbnail for binary reading; the caller closes it."""
        return open(self.thumbnail_path, "rb")

    @property
    def thumbnail_url(self):
        """URL of the "fetch" view for the document's thumbnail."""
        return reverse("fetch", kwargs={"kind": "thumb", "pk": self.pk})
2016-02-27 20:18:50 +00:00
class Log(models.Model):
    """
    A log message stored in the database, optionally grouped with related
    messages through a shared ``group`` UUID.
    """

    # Choices for ``level``, keyed by the standard ``logging`` constants.
    LEVELS = (
        (logging.DEBUG, "Debugging"),
        (logging.INFO, "Informational"),
        (logging.WARNING, "Warning"),
        (logging.ERROR, "Error"),
        (logging.CRITICAL, "Critical"),
    )

    group = models.UUIDField(blank=True)
    message = models.TextField()
    level = models.PositiveIntegerField(choices=LEVELS, default=logging.INFO)
    created = models.DateTimeField(auto_now_add=True)
    modified = models.DateTimeField(auto_now=True)

    objects = LogManager()

    class Meta:
        ordering = ("-modified",)

    def __str__(self):
        return self.message

    def save(self, *args, **kwargs):
        """
        To allow for the case where we don't want to group the message, we
        shouldn't force the caller to specify a one-time group value.  However,
        allowing group=None means that the manager can't differentiate the
        different un-grouped messages, so instead we set a random one here.

        All positional and keyword arguments are forwarded unchanged to the
        parent ``save()``.
        """
        if not self.group:
            self.group = uuid.uuid4()
        # Use cooperative super() rather than naming models.Model directly,
        # so any other class in a subclass's MRO still gets its save() run.
        super().save(*args, **kwargs)
2016-03-24 19:18:33 +00:00
2018-04-22 16:28:03 +01:00
class FileInfo:
    """
    Metadata parsed out of a document's file name.

    ``from_path()`` tries each pattern in ``REGEXES`` in order and builds an
    instance from the first one that matches the file's base name.
    """

    # This epic regex *almost* worked for our needs, so I'm keeping it here for
    # posterity, in the hopes that we might find a way to make it work one day.
    ALMOST_REGEX = re.compile(
        r"^((?P<date>\d\d\d\d\d\d\d\d\d\d\d\d\d\dZ){separator})?"
        r"((?P<correspondent>{non_separated_word}+){separator})??"
        r"(?P<title>{non_separated_word}+)"
        r"({separator}(?P<tags>[a-z,0-9-]+))?"
        r"\.(?P<extension>[a-zA-Z.-]+)$".format(
            separator=r"\s+-\s+",
            non_separated_word=r"([\w,. ]|([^\s]-))"
        )
    )

    # Alternation of accepted file extensions, shared by every pattern below.
    formats = "pdf|jpe?g|png|gif|tiff?|te?xt|md|csv"

    # Ordered from most to least specific: the first match wins.
    REGEXES = OrderedDict([
        ("created-correspondent-title-tags", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*) - "
            r"(?P<tags>[a-z0-9\-,]*)"
            r"\.(?P<extension>{})$".format(formats),
            flags=re.IGNORECASE
        )),
        ("created-title-tags", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<title>.*) - "
            r"(?P<tags>[a-z0-9\-,]*)"
            r"\.(?P<extension>{})$".format(formats),
            flags=re.IGNORECASE
        )),
        ("created-correspondent-title", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*)"
            r"\.(?P<extension>{})$".format(formats),
            flags=re.IGNORECASE
        )),
        ("created-title", re.compile(
            r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - "
            r"(?P<title>.*)"
            r"\.(?P<extension>{})$".format(formats),
            flags=re.IGNORECASE
        )),
        ("correspondent-title-tags", re.compile(
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*) - "
            r"(?P<tags>[a-z0-9\-,]*)"
            r"\.(?P<extension>{})$".format(formats),
            flags=re.IGNORECASE
        )),
        ("correspondent-title", re.compile(
            r"(?P<correspondent>.*) - "
            r"(?P<title>.*)?"
            r"\.(?P<extension>{})$".format(formats),
            flags=re.IGNORECASE
        )),
        ("title", re.compile(
            r"(?P<title>.*)"
            r"\.(?P<extension>{})$".format(formats),
            flags=re.IGNORECASE
        ))
    ])

    def __init__(self, created=None, correspondent=None, title=None, tags=(),
                 extension=None):
        self.created = created
        self.title = title
        self.extension = extension
        self.correspondent = correspondent
        self.tags = tags

    @classmethod
    def _get_created(cls, created):
        """
        Parse the ``created`` capture (digits followed by ``Z``).

        The digits are right-padded with zeros to 14 characters before
        parsing.  Returns ``None`` when the value cannot be parsed.
        """
        try:
            return dateutil.parser.parse("{:0<14}Z".format(created[:-1]))
        except (ValueError, OverflowError):
            # dateutil documents both for unparseable / out-of-range input.
            return None

    @classmethod
    def _get_correspondent(cls, name):
        """Fetch or create the Correspondent called *name*; None if empty."""
        if not name:
            return None
        return Correspondent.objects.get_or_create(name=name, defaults={
            "slug": slugify(name)
        })[0]

    @classmethod
    def _get_title(cls, title):
        """Return the title capture unchanged."""
        return title

    @classmethod
    def _get_tags(cls, tags):
        """
        Fetch or create a Tag for every entry of a comma-separated string.

        Tags are looked up by lower-cased slug; a newly created tag takes
        the entry as its name.  Empty entries are skipped: the tags pattern
        may match an empty string, and stray commas would otherwise produce
        a Tag with an empty slug and name.
        """
        entries = [entry for entry in (tags or "").split(",") if entry]
        return tuple(
            Tag.objects.get_or_create(
                slug=entry.lower(),
                defaults={"name": entry}
            )[0]
            for entry in entries
        )

    @classmethod
    def _get_extension(cls, extension):
        """Lower-case the extension, mapping "jpeg"->"jpg", "tif"->"tiff"."""
        r = extension.lower()
        if r == "jpeg":
            return "jpg"
        if r == "tif":
            return "tiff"
        return r

    @classmethod
    def _mangle_property(cls, properties, name):
        """Replace ``properties[name]``, if present, via ``_get_<name>``."""
        if name in properties:
            properties[name] = getattr(cls, "_get_{}".format(name))(
                properties[name]
            )

    @classmethod
    def from_path(cls, path):
        """
        We use a crude naming convention to make handling the correspondent,
        title, and tags easier:
          "<date> - <correspondent> - <title> - <tags>.<suffix>"
          "<correspondent> - <title> - <tags>.<suffix>"
          "<correspondent> - <title>.<suffix>"
          "<title>.<suffix>"

        Returns a FileInfo built from the first matching pattern, or None
        when the base name of *path* matches none of them.
        """
        # Only the base name is matched; compute it once for all patterns.
        base_name = os.path.basename(path)
        for regex in cls.REGEXES.values():
            m = regex.match(base_name)
            if m:
                properties = m.groupdict()
                cls._mangle_property(properties, "created")
                cls._mangle_property(properties, "correspondent")
                cls._mangle_property(properties, "title")
                cls._mangle_property(properties, "tags")
                cls._mangle_property(properties, "extension")
                return cls(**properties)
        return None