# Generated by Django 3.1.6 on 2021-02-07 22:26
import datetime
import hashlib
import logging
import os
import shutil
from time import sleep
import pathvalidate
from django.conf import settings
from django.db import migrations, models
from django.template.defaultfilters import slugify
from documents.file_handling import defaultdictNoStr, many_to_dictionary
logger = logging.getLogger("paperless.migrations")
###############################################################################
# This is code copied straight from paperless before the change.
###############################################################################
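# Migrations must keep working even when the application code changes in
# later versions, which is why these helpers are duplicated here as a
# snapshot instead of being imported from the app.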


def archive_name_from_filename(filename):
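    # Old naming scheme: the archive file shares the original's filename,
    # with the extension replaced by .pdf.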
    return os.path.splitext(filename)[0] + ".pdf"


def archive_path_old(doc):
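    # Absolute archive path as computed before this migration: derived from
    # the original's filename, or from the primary key if no filename is set.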
    if doc.filename:
        fname = archive_name_from_filename(doc.filename)
    else:
        fname = "{:07}.pdf".format(doc.pk)

    return os.path.join(
        settings.ARCHIVE_DIR,
        fname
    )


STORAGE_TYPE_GPG = "gpg"


def archive_path_new(doc):
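    # Absolute archive path under the new scheme: based on the separate
    # archive_filename field introduced by this migration, or None if the
    # document has no archived version.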
    if doc.archive_filename is not None:
        return os.path.join(
            settings.ARCHIVE_DIR,
            str(doc.archive_filename)
        )
    else:
        return None


def source_path(doc):
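    # Absolute path of the original file in ORIGINALS_DIR, including the
    # .gpg suffix for GPG-encrypted storage.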
    if doc.filename:
        fname = str(doc.filename)
    else:
        fname = "{:07}{}".format(doc.pk, doc.file_type)
        if doc.storage_type == STORAGE_TYPE_GPG:
            fname += ".gpg"  # pragma: no cover

    return os.path.join(
        settings.ORIGINALS_DIR,
        fname
    )


def generate_unique_filename(doc, archive_filename=False):
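    # Appends a counter to the generated filename until it no longer collides
    # with an existing file, keeping the current filename if it still matches.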
    if archive_filename:
        old_filename = doc.archive_filename
        root = settings.ARCHIVE_DIR
    else:
        old_filename = doc.filename
        root = settings.ORIGINALS_DIR

    counter = 0

    while True:
        new_filename = generate_filename(
            doc, counter, archive_filename=archive_filename)
        if new_filename == old_filename:
            # still the same as before.
            return new_filename

        if os.path.exists(os.path.join(root, new_filename)):
            counter += 1
        else:
            return new_filename


def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
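    # Builds a filename from PAPERLESS_FILENAME_FORMAT if configured, falling
    # back to the zero-padded primary key otherwise.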
    path = ""

    try:
        if settings.PAPERLESS_FILENAME_FORMAT is not None:
            tags = defaultdictNoStr(lambda: slugify(None),
                                    many_to_dictionary(doc.tags))

            tag_list = pathvalidate.sanitize_filename(
                ",".join(sorted(
                    [tag.name for tag in doc.tags.all()]
                )),
                replacement_text="-"
            )

            if doc.correspondent:
                correspondent = pathvalidate.sanitize_filename(
                    doc.correspondent.name, replacement_text="-"
                )
            else:
                correspondent = "none"

            if doc.document_type:
                document_type = pathvalidate.sanitize_filename(
                    doc.document_type.name, replacement_text="-"
                )
            else:
                document_type = "none"

            path = settings.PAPERLESS_FILENAME_FORMAT.format(
                title=pathvalidate.sanitize_filename(
                    doc.title, replacement_text="-"),
                correspondent=correspondent,
                document_type=document_type,
                created=datetime.date.isoformat(doc.created),
                created_year=doc.created.year if doc.created else "none",
                created_month=f"{doc.created.month:02}" if doc.created else "none",  # NOQA: E501
                created_day=f"{doc.created.day:02}" if doc.created else "none",
                added=datetime.date.isoformat(doc.added),
                added_year=doc.added.year if doc.added else "none",
                added_month=f"{doc.added.month:02}" if doc.added else "none",
                added_day=f"{doc.added.day:02}" if doc.added else "none",
                tags=tags,
                tag_list=tag_list
            ).strip()

            path = path.strip(os.sep)

    except (ValueError, KeyError, IndexError):
        logger.warning(
            f"Invalid PAPERLESS_FILENAME_FORMAT: "
            f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")

    counter_str = f"_{counter:02}" if counter else ""

    filetype_str = ".pdf" if archive_filename else doc.file_type

    if len(path) > 0:
        filename = f"{path}{counter_str}{filetype_str}"
    else:
        filename = f"{doc.pk:07}{counter_str}{filetype_str}"

    # Append .gpg for encrypted files
    if append_gpg and doc.storage_type == STORAGE_TYPE_GPG:
        filename += ".gpg"

    return filename


###############################################################################
# This code performs bidirectional archive file transformation.
###############################################################################


def create_archive_version(doc, retry_count=4):
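    # Re-parses the original document and stores a fresh archive version,
    # retrying a few times so that slow parser services can become available.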
    from documents.parsers import get_parser_class_for_mime_type, \
        DocumentParser, \
        ParseError

    logger.info(
        f"Regenerating archive document for document ID: {doc.id}"
    )
    parser_class = get_parser_class_for_mime_type(doc.mime_type)

    for try_num in range(retry_count):
        parser: DocumentParser = parser_class(None, None)
        try:
            parser.parse(source_path(doc), doc.mime_type,
                         os.path.basename(doc.filename))
            doc.content = parser.get_text()

            if parser.get_archive_path() and os.path.isfile(
                    parser.get_archive_path()):
                doc.archive_filename = generate_unique_filename(
                    doc, archive_filename=True)
                with open(parser.get_archive_path(), "rb") as f:
                    doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
                os.makedirs(os.path.dirname(archive_path_new(doc)),
                            exist_ok=True)
                shutil.copy2(parser.get_archive_path(), archive_path_new(doc))
            else:
                doc.archive_checksum = None
                logger.error(
                    f"Parser did not return an archive document for document "
                    f"ID: {doc.id}. Removing archive document."
                )
            doc.save()
            return
        except ParseError:
            if try_num + 1 == retry_count:
                logger.exception(
                    f"Unable to regenerate archive document for ID: {doc.id}. "
                    f"You need to invoke the document_archiver management "
                    f"command manually for that document."
                )
                doc.archive_checksum = None
                doc.save()
                return
            else:
                # This is mostly here for the tika parser in docker
                # environments. The servers for parsing need to come up first,
                # and the docker setup doesn't ensure that tika is running
                # before attempting migrations.
                logger.error("Parse error, will try again in 5 seconds...")
                sleep(5)
        finally:
            parser.cleanup()


def move_old_to_new_locations(apps, schema_editor):
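    # Forward migration: detect originals that share an archive file, delete
    # those ambiguous archive versions and regenerate them, and record an
    # archive_filename for every unaffected document.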
    Document = apps.get_model("documents", "Document")

    affected_document_ids = set()

    old_archive_path_to_id = {}

    # check for documents that have incorrect archive versions
    for doc in Document.objects.filter(archive_checksum__isnull=False):
        old_path = archive_path_old(doc)

        if not os.path.isfile(old_path):
            raise ValueError(
                f"Archived document ID: {doc.id} does not exist at: "
                f"{old_path}")

        if old_path in old_archive_path_to_id:
            affected_document_ids.add(doc.id)
            affected_document_ids.add(old_archive_path_to_id[old_path])
        else:
            old_archive_path_to_id[old_path] = doc.id

    # check that we can regenerate these archive versions
    for doc_id in affected_document_ids:
        from documents.parsers import get_parser_class_for_mime_type
        doc = Document.objects.get(id=doc_id)
        parser_class = get_parser_class_for_mime_type(doc.mime_type)
        if not parser_class:
            raise Exception(
                f"Document ID: {doc.id} has an invalid archived document, "
                f"but no parsers are available. Cannot migrate.")

    for doc in Document.objects.filter(archive_checksum__isnull=False):
        if doc.id in affected_document_ids:
            old_path = archive_path_old(doc)
            # remove affected archive versions
            if os.path.isfile(old_path):
                os.unlink(old_path)
        else:
            # Set archive path for unaffected files
            doc.archive_filename = archive_name_from_filename(doc.filename)
            Document.objects.filter(id=doc.id).update(
                archive_filename=doc.archive_filename
            )

    # regenerate archive documents
    for doc_id in affected_document_ids:
        doc = Document.objects.get(id=doc_id)
        create_archive_version(doc)


def move_new_to_old_locations(apps, schema_editor):
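    # Reverse migration: verify that moving archive files back to the
    # <original>.pdf naming scheme cannot clash, then move them.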
    Document = apps.get_model("documents", "Document")

    old_archive_paths = set()

    for doc in Document.objects.filter(archive_checksum__isnull=False):
        new_archive_path = archive_path_new(doc)
        old_archive_path = archive_path_old(doc)

        if old_archive_path in old_archive_paths:
            raise ValueError(
                f"Cannot migrate: Archive file name {old_archive_path} of "
                f"document {doc.filename} would clash with another archive "
                f"filename.")
        old_archive_paths.add(old_archive_path)

        if new_archive_path != old_archive_path \
                and os.path.isfile(old_archive_path):
            raise ValueError(
                f"Cannot migrate: Cannot move {new_archive_path} to "
                f"{old_archive_path}: file already exists."
            )

    for doc in Document.objects.filter(archive_checksum__isnull=False):
        new_archive_path = archive_path_new(doc)
        old_archive_path = archive_path_old(doc)
        if new_archive_path != old_archive_path:
            logger.debug(f"Moving {new_archive_path} to {old_archive_path}")
            shutil.move(new_archive_path, old_archive_path)


class Migration(migrations.Migration):

    dependencies = [
        ('documents', '1011_auto_20210101_2340'),
    ]

    operations = [
        migrations.AddField(
            model_name='document',
            name='archive_filename',
            field=models.FilePathField(default=None, editable=False, help_text='Current archive filename in storage', max_length=1024, null=True, unique=True, verbose_name='archive filename'),
        ),
        migrations.AlterField(
            model_name='document',
            name='filename',
            field=models.FilePathField(default=None, editable=False, help_text='Current filename in storage', max_length=1024, null=True, unique=True, verbose_name='filename'),
        ),
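        # The second callable is the reverse operation, which makes this
        # data migration reversible.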
        migrations.RunPython(
            move_old_to_new_locations,
            move_new_to_old_locations
        ),
    ]