paperless-ngx/src/documents/file_handling.py

import datetime
import logging
import os
from collections import defaultdict

import pathvalidate
from django.conf import settings
from django.template.defaultfilters import slugify


def create_source_path_directory(source_path):
    """Make sure the directory that will hold ``source_path`` exists.

    Missing intermediate directories are created; a directory that already
    exists is left untouched (``exist_ok=True``).

    :param source_path: path of a file whose parent directory is needed.
    """
    parent = os.path.dirname(source_path)

    # A bare file name has no directory component. os.makedirs("") raises
    # FileNotFoundError, and there is nothing to create in that case anyway.
    if parent:
        os.makedirs(parent, exist_ok=True)


def delete_empty_directories(directory, root):
    """Remove ``directory`` and its empty ancestors, stopping at ``root``.

    Starting at ``directory``, each empty directory is removed and the walk
    continues with its parent. The walk ends at the first directory that is
    not empty, cannot be inspected or removed, or equals ``root``. ``root``
    itself is never removed. This is best-effort: file system errors end
    the walk silently instead of being raised.

    :param directory: deepest directory to try to remove.
    :param root: directory that bounds the clean-up; nothing at or above it
        is touched.
    """
    if not os.path.isdir(directory):
        return

    # Go up in the directory hierarchy and try to delete all directories
    directory = os.path.normpath(directory)
    root = os.path.normpath(root)

    if not directory.startswith(root + os.path.sep):
        # don't do anything outside our originals folder.

        # append os.path.sep so that we avoid these cases:
        #   directory = /home/originals2/test
        #   root = /home/originals ("/" gets appended and startswith fails)
        return

    while directory != root:
        try:
            # Listing is done inside the try block as well: the directory
            # may have disappeared or be unreadable by now, and that should
            # end the clean-up just like a failed rmdir does.
            if os.listdir(directory):
                # it's not empty.
                return

            # it's empty
            os.rmdir(directory)
        except OSError:
            # whatever. empty directories aren't that bad anyway.
            return

        # go one level up
        directory = os.path.normpath(os.path.dirname(directory))


def many_to_dictionary(field):
    """Turn the entries of a many-to-many ``field`` into a lookup dict.

    Every entry is stored under its position (0, 1, 2, ...) with its
    slugified name as the value. In addition, an entry whose name contains
    a delimiter is split at the first occurrence of that delimiter and
    stored as ``slugify(prefix) -> slugify(suffix)``. An underscore takes
    precedence over a hyphen as the delimiter.

    :param field: object whose ``all()`` yields items with a ``name``.
    :return: dict with positional keys plus the derived key/value pairs.
    """
    result = {}

    for position, entry in enumerate(field.all()):
        name = entry.name

        # Positional access: {tags[0]}, {tags[1]}, ...
        result[position] = slugify(name)

        # Pick the delimiter; "_" wins over "-" when both are present.
        if "_" in name:
            separator = "_"
        elif "-" in name:
            separator = "-"
        else:
            # No delimiter, so there is no key/value pair to derive.
            continue

        prefix, _, suffix = name.partition(separator)
        result[slugify(prefix)] = slugify(suffix)

    return result


def generate_filename(doc):
    """Build the file name under which ``doc`` is stored.

    If ``settings.PAPERLESS_FILENAME_FORMAT`` is set, it is expanded with
    ``str.format`` using the placeholders ``title``, ``correspondent``,
    ``document_type``, ``created``, ``created_year``, ``created_month``,
    ``created_day``, ``added``, ``added_year``, ``added_month``,
    ``added_day`` and ``tags``. Title, correspondent and document type are
    passed through ``pathvalidate.sanitize_filename``. A missing
    correspondent, document type or date is rendered as ``"none"``.

    If the format cannot be expanded (``ValueError``, ``KeyError`` or
    ``IndexError``), a warning is logged and the default name is used.

    The zero-padded primary key and ``doc.file_type`` are always appended,
    followed by ``.gpg`` when the document uses GPG storage.

    :param doc: object exposing ``pk``, ``file_type``, ``title``,
        ``correspondent``, ``document_type``, ``tags``, ``created``,
        ``added``, ``storage_type`` and ``STORAGE_TYPE_GPG``.
    :return: the generated file name.
    """
    path = ""

    try:
        if settings.PAPERLESS_FILENAME_FORMAT is not None:
            # Unknown tag keys used in the format resolve to slugify(None)
            # instead of raising KeyError.
            tags = defaultdict(lambda: slugify(None),
                               many_to_dictionary(doc.tags))

            if doc.correspondent:
                correspondent = pathvalidate.sanitize_filename(
                    doc.correspondent.name, replacement_text="-"
                )
            else:
                correspondent = "none"

            if doc.document_type:
                document_type = pathvalidate.sanitize_filename(
                    doc.document_type.name, replacement_text="-"
                )
            else:
                document_type = "none"

            # The full dates get the same "none" fallback as their
            # year/month/day parts; date.isoformat(None) would raise a
            # TypeError that the handler below does not catch.
            if doc.created:
                created = datetime.date.isoformat(doc.created)
            else:
                created = "none"

            if doc.added:
                added = datetime.date.isoformat(doc.added)
            else:
                added = "none"

            path = settings.PAPERLESS_FILENAME_FORMAT.format(
                title=pathvalidate.sanitize_filename(
                    doc.title, replacement_text="-"),
                correspondent=correspondent,
                document_type=document_type,
                created=created,
                created_year=doc.created.year if doc.created else "none",
                created_month=doc.created.month if doc.created else "none",
                created_day=doc.created.day if doc.created else "none",
                added=added,
                added_year=doc.added.year if doc.added else "none",
                added_month=doc.added.month if doc.added else "none",
                added_day=doc.added.day if doc.added else "none",
                tags=tags,
            )
    except (ValueError, KeyError, IndexError):
        # path is still "" here, so the default name is produced below.
        logging.getLogger(__name__).warning(
            f"Invalid PAPERLESS_FILENAME_FORMAT: "
            f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")

    # Always append the primary key to guarantee uniqueness of filename
    if len(path) > 0:
        filename = "%s-%07i%s" % (path, doc.pk, doc.file_type)
    else:
        filename = "%07i%s" % (doc.pk, doc.file_type)

    # Append .gpg for encrypted files
    if doc.storage_type == doc.STORAGE_TYPE_GPG:
        filename += ".gpg"

    return filename


def archive_name_from_filename(filename):
    """Return ``filename`` with its last extension replaced by ``.pdf``.

    A name without an extension simply gets ``.pdf`` appended.

    :param filename: file name or path of the original document.
    :return: the corresponding archive file name.
    """
    stem, _extension = os.path.splitext(filename)
    return stem + ".pdf"
changes to filename generation, partially addresses #90 2020-12-06 16:13:37 +01:00			`import datetime`
add exception handler for invalid filename formats. 2020-11-13 20:31:51 +01:00			`import logging`
fixed the file handling implementation. The feature is cool, but the original implementation had so many small flaws it wasn't even funny. 2020-11-11 14:21:33 +01:00			`import os`
			`from collections import defaultdict`

changes to filename generation, partially addresses #90 2020-12-06 16:13:37 +01:00			`import pathvalidate`
fixed the file handling implementation. The feature is cool, but the original implementation had so many small flaws it wasn't even funny. 2020-11-11 14:21:33 +01:00			`from django.conf import settings`
			`from django.template.defaultfilters import slugify`


			`def create_source_path_directory(source_path):`
			`os.makedirs(os.path.dirname(source_path), exist_ok=True)`


filename handling for archive files. 2020-11-30 21:38:21 +01:00			`def delete_empty_directories(directory, root):`
			`if not os.path.isdir(directory):`
			`return`

fixed the file handling implementation. The feature is cool, but the original implementation had so many small flaws it wasn't even funny. 2020-11-11 14:21:33 +01:00			`# Go up in the directory hierarchy and try to delete all directories`
			`directory = os.path.normpath(directory)`
filename handling for archive files. 2020-11-30 21:38:21 +01:00			`root = os.path.normpath(root)`
fixed the file handling implementation. The feature is cool, but the original implementation had so many small flaws it wasn't even funny. 2020-11-11 14:21:33 +01:00
			`if not directory.startswith(root + os.path.sep):`
			`# don't do anything outside our originals folder.`

			`# append os.path.set so that we avoid these cases:`
			`# directory = /home/originals2/test`
			`# root = /home/originals ("/" gets appended and startswith fails)`
			`return`

			`while directory != root:`
			`if not os.listdir(directory):`
			`# it's empty`
			`try:`
			`os.rmdir(directory)`
			`except OSError:`
			`# whatever. empty directories aren't that bad anyway.`
			`return`
			`else:`
			`# it's not empty.`
			`return`

			`# go one level up`
			`directory = os.path.normpath(os.path.dirname(directory))`


			`def many_to_dictionary(field):`
			`# Converts ManyToManyField to dictionary by assuming, that field`
			`# entries contain an _ or - which will be used as a delimiter`
			`mydictionary = dict()`

			`for index, t in enumerate(field.all()):`
			`# Populate tag names by index`
			`mydictionary[index] = slugify(t.name)`

			`# Find delimiter`
			`delimiter = t.name.find('_')`

			`if delimiter == -1:`
			`delimiter = t.name.find('-')`

			`if delimiter == -1:`
			`continue`

			`key = t.name[:delimiter]`
			`value = t.name[delimiter + 1:]`

			`mydictionary[slugify(key)] = slugify(value)`

			`return mydictionary`


code cleanup 2020-11-21 15:34:00 +01:00			`def generate_filename(doc):`
add exception handler for invalid filename formats. 2020-11-13 20:31:51 +01:00			`path = ""`

			`try:`
			`if settings.PAPERLESS_FILENAME_FORMAT is not None:`
			`tags = defaultdict(lambda: slugify(None),`
code cleanup 2020-11-21 15:34:00 +01:00			`many_to_dictionary(doc.tags))`
changes to filename generation, partially addresses #90 2020-12-06 16:13:37 +01:00
			`if doc.correspondent:`
			`correspondent = pathvalidate.sanitize_filename(`
			`doc.correspondent.name, replacement_text="-"`
			`)`
			`else:`
			`correspondent = "none"`

			`if doc.document_type:`
			`document_type = pathvalidate.sanitize_filename(`
			`doc.document_type.name, replacement_text="-"`
			`)`
			`else:`
			`document_type = "none"`

add exception handler for invalid filename formats. 2020-11-13 20:31:51 +01:00			`path = settings.PAPERLESS_FILENAME_FORMAT.format(`
changes to filename generation, partially addresses #90 2020-12-06 16:13:37 +01:00			`title=pathvalidate.sanitize_filename(`
			`doc.title, replacement_text="-"),`
			`correspondent=correspondent,`
			`document_type=document_type,`
			`created=datetime.date.isoformat(doc.created),`
code cleanup 2020-11-21 15:34:00 +01:00			`created_year=doc.created.year if doc.created else "none",`
			`created_month=doc.created.month if doc.created else "none",`
			`created_day=doc.created.day if doc.created else "none",`
changes to filename generation, partially addresses #90 2020-12-06 16:13:37 +01:00			`added=datetime.date.isoformat(doc.added),`
code cleanup 2020-11-21 15:34:00 +01:00			`added_year=doc.added.year if doc.added else "none",`
			`added_month=doc.added.month if doc.added else "none",`
			`added_day=doc.added.day if doc.added else "none",`
add exception handler for invalid filename formats. 2020-11-13 20:31:51 +01:00			`tags=tags,`
			`)`
codestyle 2020-11-18 22:41:14 +01:00			`except (ValueError, KeyError, IndexError):`
code cleanup 2020-11-21 14:03:45 +01:00			`logging.getLogger(__name__).warning(`
			`f"Invalid PAPERLESS_FILENAME_FORMAT: "`
			`f"{settings.PAPERLESS_FILENAME_FORMAT}, falling back to default")`
fixed the file handling implementation. The feature is cool, but the original implementation had so many small flaws it wasn't even funny. 2020-11-11 14:21:33 +01:00
			`# Always append the primary key to guarantee uniqueness of filename`
			`if len(path) > 0:`
code cleanup 2020-11-21 15:34:00 +01:00			`filename = "%s-%07i%s" % (path, doc.pk, doc.file_type)`
fixed the file handling implementation. The feature is cool, but the original implementation had so many small flaws it wasn't even funny. 2020-11-11 14:21:33 +01:00			`else:`
code cleanup 2020-11-21 15:34:00 +01:00			`filename = "%07i%s" % (doc.pk, doc.file_type)`
fixed the file handling implementation. The feature is cool, but the original implementation had so many small flaws it wasn't even funny. 2020-11-11 14:21:33 +01:00
			`# Append .gpg for encrypted files`
code cleanup 2020-11-21 15:34:00 +01:00			`if doc.storage_type == doc.STORAGE_TYPE_GPG:`
fixed the file handling implementation. The feature is cool, but the original implementation had so many small flaws it wasn't even funny. 2020-11-11 14:21:33 +01:00			`filename += ".gpg"`

			`return filename`
filename handling for archive files. 2020-11-30 21:38:21 +01:00

			`def archive_name_from_filename(filename):`

			`return os.path.splitext(filename)[0] + ".pdf"`