paperless-ngx/src/documents/bulk_edit.py

335 lines
10 KiB
Python
Raw Normal View History

import hashlib
import itertools
import logging
import os
from typing import Optional
from celery import chord
from django.conf import settings
2020-12-11 14:27:54 +01:00
from django.db.models import Q
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource
from documents.models import Correspondent
from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import DocumentType
Feature: Dynamic document storage pathes (#916) * Added devcontainer * Add feature storage pathes * Exclude tests and add versioning * Check escaping * Check escaping * Check quoting * Echo * Escape * Escape : * Double escape \ * Escaping * Remove if * Escape colon * Missing \ * Esacpe : * Escape all * test * Remove sed * Fix exclude * Remove SED command * Add LD_LIBRARY_PATH * Adjusted to v1.7 * Updated test-cases * Remove devcontainer * Removed internal build-file * Run pre-commit * Corrected flak8 error * Adjusted to v1.7 * Updated test-cases * Corrected flak8 error * Adjusted to new plural translations * Small adjustments due to code-review backend * Adjusted line-break * Removed PAPERLESS prefix from settings variables * Corrected style change due to search+replace * First documentation draft * Revert changes to Pipfile * Add sphinx-autobuild with keep-outdated * Revert merge error that results in wrong storage path is evaluated * Adjust styles of generated files ... * Adds additional testing to cover dynamic storage path functionality * Remove unnecessary condition * Add hint to edit storage path dialog * Correct spelling of pathes to paths * Minor documentation tweaks * Minor typo * improving wrapping of filter editor buttons with new storage path button * Update .gitignore * Fix select border radius in non input-groups * Better storage path edit hint * Add note to edit storage path dialog re document_renamer * Add note to bulk edit storage path re document_renamer * Rename FILTER_STORAGE_DIRECTORY to PATH * Fix broken filter rule parsing * Show default storage if unspecified * Remove note re storage path on bulk edit * Add basic validation of filename variables Co-authored-by: Markus Kling <markus@markus-kling.net> Co-authored-by: Trenton Holmes <holmes.trenton@gmail.com> Co-authored-by: Michael Shamoon <4887959+shamoon@users.noreply.github.com> Co-authored-by: Quinn Casey <quinn@quinncasey.com>
2022-05-19 23:42:25 +02:00
from documents.models import StoragePath
2022-12-08 02:03:50 -08:00
from documents.permissions import set_permissions_for_object
from documents.tasks import bulk_update_documents
from documents.tasks import consume_file
from documents.tasks import update_document_archive_file
2020-11-30 13:58:40 +01:00
logger = logging.getLogger("paperless.bulk_edit")
2020-11-30 13:58:40 +01:00
def set_correspondent(doc_ids: list[int], correspondent):
    """Assign a correspondent to the selected documents that do not have it yet.

    Args:
        doc_ids: Primary keys of the documents to consider.
        correspondent: Id of the correspondent to assign. A falsy value is
            not looked up and is used as-is in both the filter and the update
            (presumably ``None`` to clear the field -- confirm with callers).

    Returns:
        The literal string ``"OK"``.
    """
    if correspondent:
        correspondent = Correspondent.objects.only("pk").get(id=correspondent)

    # Restrict the work to documents whose correspondent actually differs.
    needs_change = Q(id__in=doc_ids) & ~Q(correspondent=correspondent)
    documents = (
        Document.objects.filter(needs_change)
        .select_related("correspondent")
        .only("pk", "correspondent__id")
    )

    # Collect the ids first: after the update the rows stop matching the
    # filter above, so the queryset could no longer report them.
    changed_ids = list(documents.values_list("pk", flat=True))
    documents.update(correspondent=correspondent)

    bulk_update_documents.delay(document_ids=changed_ids)
    return "OK"
2020-11-30 13:58:40 +01:00
def set_storage_path(doc_ids: list[int], storage_path):
    """Assign a storage path to the selected documents that do not have it yet.

    Args:
        doc_ids: Primary keys of the documents to consider.
        storage_path: Id of the storage path to assign. A falsy value is not
            looked up and is used as-is in both the filter and the update
            (presumably ``None`` to clear the field -- confirm with callers).

    Returns:
        The literal string ``"OK"``.
    """
    if storage_path:
        storage_path = StoragePath.objects.only("pk").get(id=storage_path)

    # Restrict the work to documents whose storage path actually differs.
    needs_change = Q(id__in=doc_ids) & ~Q(storage_path=storage_path)
    documents = (
        Document.objects.filter(needs_change)
        .select_related("storage_path")
        .only("pk", "storage_path__id")
    )

    # Collect the ids first: after the update the rows stop matching the
    # filter above, so the queryset could no longer report them.
    changed_ids = list(documents.values_list("pk", flat=True))
    documents.update(storage_path=storage_path)

    bulk_update_documents.delay(document_ids=changed_ids)
    return "OK"
def set_document_type(doc_ids: list[int], document_type):
    """Assign a document type to the selected documents that do not have it yet.

    Args:
        doc_ids: Primary keys of the documents to consider.
        document_type: Id of the document type to assign. A falsy value is
            not looked up and is used as-is in both the filter and the update
            (presumably ``None`` to clear the field -- confirm with callers).

    Returns:
        The literal string ``"OK"``.
    """
    if document_type:
        document_type = DocumentType.objects.only("pk").get(id=document_type)

    # Restrict the work to documents whose type actually differs.
    needs_change = Q(id__in=doc_ids) & ~Q(document_type=document_type)
    documents = (
        Document.objects.filter(needs_change)
        .select_related("document_type")
        .only("pk", "document_type__id")
    )

    # Collect the ids first: after the update the rows stop matching the
    # filter above, so the queryset could no longer report them.
    changed_ids = list(documents.values_list("pk", flat=True))
    documents.update(document_type=document_type)

    bulk_update_documents.delay(document_ids=changed_ids)
    return "OK"
2020-12-06 14:39:53 +01:00
def add_tag(doc_ids: list[int], tag: int):
    """Attach one tag to every selected document that does not carry it yet.

    Args:
        doc_ids: Primary keys of the documents to consider.
        tag: Id of the tag to attach.

    Returns:
        The literal string ``"OK"``.
    """
    untagged = Document.objects.filter(Q(id__in=doc_ids) & ~Q(tags__id=tag)).only("pk")
    target_ids = list(untagged.values_list("pk", flat=True))

    # Write rows straight into the many-to-many "through" table so all
    # links are created with a single bulk_create call.
    through_model = Document.tags.through
    new_links = [
        through_model(document_id=doc_id, tag_id=tag) for doc_id in target_ids
    ]
    through_model.objects.bulk_create(new_links)

    bulk_update_documents.delay(document_ids=target_ids)
    return "OK"
def remove_tag(doc_ids: list[int], tag: int):
    """Detach one tag from every selected document that currently carries it.

    Args:
        doc_ids: Primary keys of the documents to consider.
        tag: Id of the tag to detach.

    Returns:
        The literal string ``"OK"``.
    """
    tagged = Document.objects.filter(id__in=doc_ids, tags__id=tag).only("pk")
    target_ids = list(tagged.values_list("pk", flat=True))

    # Delete the link rows directly from the many-to-many "through" table.
    through_model = Document.tags.through
    stale_links = through_model.objects.filter(
        document_id__in=target_ids,
        tag_id=tag,
    )
    stale_links.delete()

    bulk_update_documents.delay(document_ids=target_ids)
    return "OK"
def modify_tags(doc_ids: list[int], add_tags: list[int], remove_tags: list[int]):
    """Remove and add several tags on the selected documents in one pass.

    Removal runs before addition, so a tag id present in both lists ends up
    attached.

    Args:
        doc_ids: Primary keys of the documents to modify.
        add_tags: Ids of the tags to attach to every document.
        remove_tags: Ids of the tags to detach from every document.

    Returns:
        The literal string ``"OK"``.
    """
    existing = Document.objects.filter(id__in=doc_ids).only("pk")
    target_ids = list(existing.values_list("pk", flat=True))

    through_model = Document.tags.through

    through_model.objects.filter(
        document_id__in=target_ids,
        tag_id__in=remove_tags,
    ).delete()

    # Every (document, tag) combination; links that already exist are
    # skipped by ignore_conflicts rather than raising.
    new_links = [
        through_model(document_id=doc_id, tag_id=tag_id)
        for doc_id in target_ids
        for tag_id in add_tags
    ]
    through_model.objects.bulk_create(new_links, ignore_conflicts=True)

    bulk_update_documents.delay(document_ids=target_ids)
    return "OK"
def modify_custom_fields(doc_ids: list[int], add_custom_fields, remove_custom_fields):
    """Add and remove custom field instances on the selected documents.

    Additions run before removals, so a field id present in both arguments
    ends up removed.

    Args:
        doc_ids: Primary keys of the documents to modify.
        add_custom_fields: Iterable of custom field ids to attach to every
            document.
        remove_custom_fields: Iterable of custom field ids whose instances
            are deleted from every document.

    Returns:
        The literal string ``"OK"``.
    """
    qs = Document.objects.filter(id__in=doc_ids).only("pk")
    affected_docs = list(qs.values_list("pk", flat=True))

    fields_to_add = [
        CustomFieldInstance(document_id=doc_id, field_id=field)
        for field in add_custom_fields
        for doc_id in affected_docs
    ]
    # ignore_conflicts mirrors modify_tags: a document that already has one of
    # the fields must not abort the whole bulk edit.
    # NOTE(review): this matters only if CustomFieldInstance enforces
    # uniqueness on (document, field) -- confirm against the model.
    CustomFieldInstance.objects.bulk_create(fields_to_add, ignore_conflicts=True)

    CustomFieldInstance.objects.filter(
        document_id__in=affected_docs,
        field_id__in=remove_custom_fields,
    ).delete()

    bulk_update_documents.delay(document_ids=affected_docs)
    return "OK"
def delete(doc_ids: list[int]):
    """Delete the selected documents and drop them from the search index.

    Args:
        doc_ids: Primary keys of the documents to delete.

    Returns:
        The literal string ``"OK"``.
    """
    Document.objects.filter(id__in=doc_ids).delete()

    # Imported here rather than at module level (presumably to avoid a
    # circular import -- confirm before moving it).
    from documents import index

    # Every requested id is removed from the index through one shared writer.
    with index.open_index_writer() as writer:
        for doc_id in doc_ids:
            index.remove_document_by_id(writer, doc_id)

    return "OK"
2022-06-22 05:53:41 -07:00
def redo_ocr(doc_ids: list[int]):
    """Queue one ``update_document_archive_file`` task per selected document.

    Args:
        doc_ids: Primary keys of the documents to reprocess.

    Returns:
        The literal string ``"OK"``.
    """
    for doc_id in doc_ids:
        update_document_archive_file.delay(document_id=doc_id)
    return "OK"
2022-12-08 02:03:50 -08:00
def set_permissions(doc_ids: list[int], set_permissions, owner=None, merge=False):
    """Set the owner and object permissions of the selected documents.

    Args:
        doc_ids: Primary keys of the documents to modify.
        set_permissions: Permission specification handed unchanged to
            ``set_permissions_for_object``.
        owner: Owner value written to the documents.
        merge: When true, only documents without an owner receive ``owner``,
            and the flag is forwarded to ``set_permissions_for_object``.
            When false, the owner is overwritten on every document.

    Returns:
        The literal string ``"OK"``.
    """
    documents = Document.objects.filter(id__in=doc_ids).select_related("owner")

    # If merging, only set owner for documents that don't have an owner
    owner_targets = documents.filter(owner__isnull=True) if merge else documents
    owner_targets.update(owner=owner)

    for document in documents:
        set_permissions_for_object(
            permissions=set_permissions,
            object=document,
            merge=merge,
        )

    changed_ids = list(documents.values_list("pk", flat=True))
    bulk_update_documents.delay(document_ids=changed_ids)
    return "OK"
def rotate(doc_ids: list[int], degrees: int):
    """Rotate every page of the selected PDF documents in place.

    Non-PDF documents are skipped with a warning. A failure on one document
    is logged and does not stop the remaining documents.

    Args:
        doc_ids: Primary keys of the documents to rotate.
        degrees: Rotation applied relative to each page's current rotation.

    Returns:
        The literal string ``"OK"``.
    """
    logger.info(
        f"Attempting to rotate {len(doc_ids)} documents by {degrees} degrees.",
    )
    candidates = Document.objects.filter(id__in=doc_ids)
    rotated_ids = []

    import pikepdf

    archive_signatures = []

    for doc in candidates:
        if doc.mime_type != "application/pdf":
            logger.warning(
                f"Document {doc.id} is not a PDF, skipping rotation.",
            )
            continue
        try:
            # The source file is overwritten, hence allow_overwriting_input.
            with pikepdf.open(doc.source_path, allow_overwriting_input=True) as pdf:
                for page in pdf.pages:
                    page.rotate(degrees, relative=True)
                pdf.save()

            # The file content changed, so the stored checksum is recomputed.
            new_bytes = doc.source_path.read_bytes()
            doc.checksum = hashlib.md5(new_bytes).hexdigest()
            doc.save()

            archive_signatures.append(
                update_document_archive_file.s(document_id=doc.id),
            )
            logger.info(
                f"Rotated document {doc.id} by {degrees} degrees",
            )
            rotated_ids.append(doc.id)
        except Exception as e:
            logger.exception(f"Error rotating document {doc.id}: {e}")

    if rotated_ids:
        # Chord: the per-document archive tasks form the header and the
        # bulk update is the body.
        bulk_update_task = bulk_update_documents.si(document_ids=rotated_ids)
        chord(header=archive_signatures, body=bulk_update_task).delay()

    return "OK"
def merge(doc_ids: list[int], metadata_document_id: Optional[int] = None):
    """Merge the selected documents into one PDF and queue it for consumption.

    Pages are appended in the order given by ``doc_ids``. A document that
    cannot be opened or merged is logged and left out. If nothing could be
    merged, no file is written and nothing is queued.

    Args:
        doc_ids: Primary keys of the documents to merge, in output order.
        metadata_document_id: Optional id of one of the merged documents
            whose metadata seeds the overrides of the new document (title
            suffixed with " (merged)"). If it is not given, or not found
            among ``doc_ids``, default overrides are used.

    Returns:
        The literal string ``"OK"``.

    Raises:
        Document.DoesNotExist: If an id in ``doc_ids`` matches no document.
    """
    logger.info(
        f"Attempting to merge {len(doc_ids)} documents into a single document.",
    )
    qs = Document.objects.filter(id__in=doc_ids)
    affected_docs = []

    import pikepdf

    merged_pdf = pikepdf.new()
    # try/finally so the in-memory PDF is closed on every exit path, including
    # the early "nothing merged" return and unexpected exceptions.
    try:
        version = merged_pdf.pdf_version
        # use doc_ids to preserve order
        for doc_id in doc_ids:
            doc = qs.get(id=doc_id)
            try:
                with pikepdf.open(str(doc.source_path)) as pdf:
                    # The output must be at least as new as its newest input.
                    version = max(version, pdf.pdf_version)
                    merged_pdf.pages.extend(pdf.pages)
                affected_docs.append(doc.id)
            except Exception as e:
                logger.exception(
                    f"Error merging document {doc.id}, it will not be included in the merge: {e}",
                )

        if not affected_docs:
            logger.warning("No documents were merged")
            return "OK"

        # The joined ids are truncated to 100 characters to bound the
        # length of the file name.
        filepath = os.path.join(
            settings.SCRATCH_DIR,
            f"{'_'.join([str(doc_id) for doc_id in doc_ids])[:100]}_merged.pdf",
        )
        merged_pdf.remove_unreferenced_resources()
        merged_pdf.save(filepath, min_version=version)
    finally:
        merged_pdf.close()

    overrides = DocumentMetadataOverrides()
    if metadata_document_id:
        # first() returns None when there is no match (get() would raise), so
        # the fallback to default overrides is actually reachable.
        metadata_document = qs.filter(id=metadata_document_id).first()
        if metadata_document is not None:
            overrides = DocumentMetadataOverrides.from_document(metadata_document)
            overrides.title = metadata_document.title + " (merged)"
        else:
            logger.warning(
                f"Metadata document {metadata_document_id} was not found among "
                "the merged documents, using default metadata",
            )

    logger.info("Adding merged document to the task queue.")
    consume_file.delay(
        ConsumableDocument(
            source=DocumentSource.ConsumeFolder,
            original_file=filepath,
        ),
        overrides,
    )

    return "OK"
def split(doc_ids: list[int], pages: list[list[int]]):
    """Split one document into several PDFs and queue each for consumption.

    Only the first id in ``doc_ids`` is used. Any error while splitting is
    logged and not re-raised; parts queued before the error stay queued.

    Args:
        doc_ids: List whose first element is the primary key of the document
            to split.
        pages: One list of 1-based page numbers per output document.

    Returns:
        The literal string ``"OK"``.

    Raises:
        IndexError: If ``doc_ids`` is empty.
        Document.DoesNotExist: If the first id matches no document.
    """
    logger.info(
        f"Attempting to split document {doc_ids[0]} into {len(pages)} documents",
    )
    doc = Document.objects.get(id=doc_ids[0])

    import pikepdf

    try:
        with pikepdf.open(doc.source_path) as pdf:
            for idx, split_doc in enumerate(pages):
                dst = pikepdf.new()
                # Close the part even when appending or saving fails.
                try:
                    for page in split_doc:
                        # Page numbers are 1-based; pdf.pages is 0-based.
                        dst.pages.append(pdf.pages[page - 1])
                    filepath = os.path.join(
                        settings.SCRATCH_DIR,
                        f"{doc.id}_{split_doc[0]}-{split_doc[-1]}.pdf",
                    )
                    dst.remove_unreferenced_resources()
                    dst.save(filepath)
                finally:
                    dst.close()

                # Called on the class, as merge() does, rather than on a
                # throwaway instance.
                overrides = DocumentMetadataOverrides.from_document(doc)
                overrides.title = f"{doc.title} (split {idx + 1})"
                logger.info(
                    f"Adding split document with pages {split_doc} to the task queue.",
                )
                consume_file.delay(
                    ConsumableDocument(
                        source=DocumentSource.ConsumeFolder,
                        original_file=filepath,
                    ),
                    overrides,
                )
    except Exception as e:
        logger.exception(f"Error splitting document {doc.id}: {e}")

    return "OK"