paperless-ngx/src/documents/views.py

673 lines
22 KiB
Python
Raw Normal View History

import logging
2020-11-25 14:48:36 +01:00
import os
import tempfile
2021-01-26 00:51:20 +01:00
import uuid
import zipfile
from datetime import datetime
from time import mktime
2020-11-25 14:48:36 +01:00
from django.conf import settings
2020-12-27 12:43:05 +01:00
from django.db.models import Count, Max, Case, When, IntegerField
2020-12-28 15:59:06 +01:00
from django.db.models.functions import Lower
from django.http import HttpResponse, HttpResponseBadRequest, Http404
from django.utils.translation import get_language
from django.views.decorators.cache import cache_control
from django.views.generic import TemplateView
from django_filters.rest_framework import DjangoFilterBackend
from django_q.tasks import async_task
from rest_framework import parsers
from rest_framework.decorators import action
from rest_framework.filters import OrderingFilter, SearchFilter
2016-03-01 18:57:12 +00:00
from rest_framework.mixins import (
DestroyModelMixin,
ListModelMixin,
RetrieveModelMixin,
UpdateModelMixin
)
2016-03-01 18:57:12 +00:00
from rest_framework.permissions import IsAuthenticated
2020-11-12 21:09:45 +01:00
from rest_framework.response import Response
from rest_framework.views import APIView
2016-03-01 18:57:12 +00:00
from rest_framework.viewsets import (
GenericViewSet,
ModelViewSet,
2021-02-06 17:02:00 +01:00
ViewSet
)
2020-11-12 21:09:45 +01:00
from paperless.db import GnuPG
from paperless.views import StandardPagination
from .bulk_download import OriginalAndArchiveStrategy, OriginalsOnlyStrategy, \
ArchiveOnlyStrategy
from .classifier import load_classifier
2018-09-26 10:51:42 +02:00
from .filters import (
CorrespondentFilterSet,
DocumentFilterSet,
TagFilterSet,
2021-02-06 17:02:00 +01:00
DocumentTypeFilterSet
2018-09-26 10:51:42 +02:00
)
2021-01-29 16:45:23 +01:00
from .matching import match_correspondents, match_tags, match_document_types
2021-02-06 17:02:00 +01:00
from .models import Correspondent, Document, Tag, DocumentType, SavedView
from .parsers import get_parser_class_for_mime_type
2016-03-01 18:57:12 +00:00
from .serialisers import (
CorrespondentSerializer,
DocumentSerializer,
2021-02-24 23:54:19 +01:00
TagSerializerVersion1,
2018-09-05 15:25:14 +02:00
TagSerializer,
DocumentTypeSerializer,
2020-12-06 14:39:53 +01:00
PostDocumentSerializer,
SavedViewSerializer,
BulkEditSerializer,
DocumentListSerializer,
BulkDownloadSerializer
2018-09-25 16:09:33 +02:00
)
2016-01-01 16:13:59 +00:00
2021-02-05 01:10:29 +01:00
logger = logging.getLogger("paperless.api")
2016-03-03 18:09:10 +00:00
class IndexView(TemplateView):
    """Serves the compiled frontend's index page for the user's locale."""

    template_name = "index.html"

    def get_language(self):
        # This is here for the following reason:
        # Django identifies languages in the form "en-us".
        # However, angular generates locales as "en-US".
        # this translates between these two forms.
        lang = get_language()
        if "-" not in lang:
            return lang
        first, _, second = lang.partition("-")
        return f"{first}-{second.upper()}"

    def get_context_data(self, **kwargs):
        context = super().get_context_data(**kwargs)
        locale = self.get_language()
        context['cookie_prefix'] = settings.COOKIE_PREFIX
        context['username'] = self.request.user.username
        context['full_name'] = self.request.user.get_full_name()
        # All frontend assets are built per locale.
        context['styles_css'] = f"frontend/{locale}/styles.css"
        context['runtime_js'] = f"frontend/{locale}/runtime.js"
        context['polyfills_js'] = f"frontend/{locale}/polyfills.js"  # NOQA: E501
        context['main_js'] = f"frontend/{locale}/main.js"
        context['webmanifest'] = f"frontend/{locale}/manifest.webmanifest"  # NOQA: E501
        context['apple_touch_icon'] = f"frontend/{locale}/apple-touch-icon.png"  # NOQA: E501
        return context
class CorrespondentViewSet(ModelViewSet):
    """CRUD endpoint for correspondents.

    Each correspondent is annotated with the number of documents that
    reference it and the creation date of the most recent such document;
    the default ordering is case-insensitive by name.
    """

    model = Correspondent

    queryset = Correspondent.objects.annotate(
        document_count=Count('documents'),
        last_correspondence=Max('documents__created'),
    ).order_by(Lower('name'))

    serializer_class = CorrespondentSerializer
    pagination_class = StandardPagination
    permission_classes = (IsAuthenticated,)
    filter_backends = (DjangoFilterBackend, OrderingFilter)
    filterset_class = CorrespondentFilterSet
    ordering_fields = (
        "name",
        "matching_algorithm",
        "match",
        "document_count",
        "last_correspondence",
    )
class TagViewSet(ModelViewSet):
    """CRUD endpoint for tags, annotated with their document counts."""

    model = Tag

    queryset = Tag.objects.annotate(
        document_count=Count('documents'),
    ).order_by(Lower('name'))

    def get_serializer_class(self):
        # Select the serializer matching the requested API version.
        return (
            TagSerializerVersion1
            if int(self.request.version) == 1
            else TagSerializer
        )

    pagination_class = StandardPagination
    permission_classes = (IsAuthenticated,)
    filter_backends = (DjangoFilterBackend, OrderingFilter)
    filterset_class = TagFilterSet
    ordering_fields = ("name", "matching_algorithm", "match", "document_count")
class DocumentTypeViewSet(ModelViewSet):
    """CRUD endpoint for document types, annotated with document counts."""

    model = DocumentType

    queryset = DocumentType.objects.annotate(
        document_count=Count('documents'),
    ).order_by(Lower('name'))

    serializer_class = DocumentTypeSerializer
    pagination_class = StandardPagination
    permission_classes = (IsAuthenticated,)
    filter_backends = (DjangoFilterBackend, OrderingFilter)
    filterset_class = DocumentTypeFilterSet
    ordering_fields = ("name", "matching_algorithm", "match", "document_count")
class DocumentViewSet(RetrieveModelMixin,
                      UpdateModelMixin,
                      DestroyModelMixin,
                      ListModelMixin,
                      GenericViewSet):
    """Retrieve/update/delete/list endpoint for documents.

    There is deliberately no create mixin here; new documents are
    submitted through PostDocumentView. Extra detail routes serve the
    file download, inline preview, thumbnail, parser metadata and
    classifier suggestions for a single document.
    """

    model = Document
    queryset = Document.objects.all()
    serializer_class = DocumentSerializer
    pagination_class = StandardPagination
    permission_classes = (IsAuthenticated,)
    filter_backends = (DjangoFilterBackend, SearchFilter, OrderingFilter)
    filterset_class = DocumentFilterSet
    search_fields = ("title", "correspondent__name", "content")
    ordering_fields = (
        "id",
        "title",
        "correspondent__name",
        "document_type__name",
        "created",
        "modified",
        "added",
        "archive_serial_number")

    def get_queryset(self):
        return Document.objects.distinct()

    def get_serializer(self, *args, **kwargs):
        # Allow clients to restrict the serialized fields via
        # ?fields=a,b,c.
        fields_param = self.request.query_params.get('fields', None)
        if fields_param:
            fields = fields_param.split(",")
        else:
            fields = None
        serializer_class = self.get_serializer_class()
        kwargs.setdefault('context', self.get_serializer_context())
        kwargs.setdefault('fields', fields)
        return serializer_class(*args, **kwargs)

    def update(self, request, *args, **kwargs):
        response = super(DocumentViewSet, self).update(
            request, *args, **kwargs)
        # Keep the search index in sync with the updated document.
        from documents import index
        index.add_or_update_document(self.get_object())
        return response

    def destroy(self, request, *args, **kwargs):
        # Remove the document from the search index before it is deleted
        # from the database.
        from documents import index
        index.remove_document_from_index(self.get_object())
        return super(DocumentViewSet, self).destroy(request, *args, **kwargs)

    @staticmethod
    def original_requested(request):
        # ?original=true requests the original file even when an archived
        # version exists.
        return (
            'original' in request.query_params and
            request.query_params['original'] == 'true'
        )

    def file_response(self, pk, request, disposition):
        """Serve either the archived or the original file of a document,
        decrypting it first if it is stored GPG-encrypted."""
        doc = Document.objects.get(id=pk)
        if not self.original_requested(request) and doc.has_archive_version:  # NOQA: E501
            file_handle = doc.archive_file
            filename = doc.get_public_filename(archive=True)
            mime_type = 'application/pdf'
        else:
            file_handle = doc.source_file
            filename = doc.get_public_filename()
            mime_type = doc.mime_type

        if doc.storage_type == Document.STORAGE_TYPE_GPG:
            file_handle = GnuPG.decrypted(file_handle)

        response = HttpResponse(file_handle, content_type=mime_type)
        response["Content-Disposition"] = '{}; filename="{}"'.format(
            disposition, filename)
        return response

    def get_metadata(self, file, mime_type):
        """Extract parser metadata from *file*.

        Returns None when the file does not exist, and an empty list when
        no parser is available for *mime_type* or extraction fails.
        """
        if not os.path.isfile(file):
            return None

        parser_class = get_parser_class_for_mime_type(mime_type)
        if parser_class:
            parser = parser_class(progress_callback=None, logging_group=None)

            try:
                return parser.extract_metadata(file, mime_type)
            except Exception:
                # TODO: cover GPG errors, remove later.
                # Fix: previously this swallowed the error silently; log
                # it so failures are diagnosable, but still degrade
                # gracefully by returning no metadata.
                logger.exception(
                    f"Error while retrieving metadata for {file}")
                return []
        else:
            return []

    def get_filesize(self, filename):
        # Returns None for files that do not exist on disk.
        if os.path.isfile(filename):
            return os.stat(filename).st_size
        else:
            return None

    @action(methods=['get'], detail=True)
    def metadata(self, request, pk=None):
        """Return checksums, sizes, filenames and parser metadata for the
        original and (if present) archived version of a document."""
        try:
            doc = Document.objects.get(pk=pk)
        except Document.DoesNotExist:
            raise Http404()

        meta = {
            "original_checksum": doc.checksum,
            "original_size": self.get_filesize(doc.source_path),
            "original_mime_type": doc.mime_type,
            "media_filename": doc.filename,
            "has_archive_version": doc.has_archive_version,
            "original_metadata": self.get_metadata(
                doc.source_path, doc.mime_type),
            "archive_checksum": doc.archive_checksum,
            "archive_media_filename": doc.archive_filename
        }

        if doc.has_archive_version:
            meta['archive_size'] = self.get_filesize(doc.archive_path)
            meta['archive_metadata'] = self.get_metadata(
                doc.archive_path, "application/pdf")
        else:
            meta['archive_size'] = None
            meta['archive_metadata'] = None

        return Response(meta)

    @action(methods=['get'], detail=True)
    def suggestions(self, request, pk=None):
        """Return classifier-matched correspondent, tag and document type
        ids for a document."""
        try:
            doc = Document.objects.get(pk=pk)
        except Document.DoesNotExist:
            raise Http404()

        classifier = load_classifier()

        return Response({
            "correspondents": [
                c.id for c in match_correspondents(doc, classifier)
            ],
            "tags": [t.id for t in match_tags(doc, classifier)],
            "document_types": [
                dt.id for dt in match_document_types(doc, classifier)
            ]
        })

    @action(methods=['get'], detail=True)
    def preview(self, request, pk=None):
        try:
            response = self.file_response(
                pk, request, "inline")
            return response
        except (FileNotFoundError, Document.DoesNotExist):
            raise Http404()

    @action(methods=['get'], detail=True)
    @cache_control(public=False, max_age=315360000)
    def thumb(self, request, pk=None):
        try:
            doc = Document.objects.get(id=pk)
            if doc.storage_type == Document.STORAGE_TYPE_GPG:
                handle = GnuPG.decrypted(doc.thumbnail_file)
            else:
                handle = doc.thumbnail_file
            # TODO: Send ETag information and use that to send new
            # thumbnails if available
            return HttpResponse(handle,
                                content_type='image/png')
        except (FileNotFoundError, Document.DoesNotExist):
            raise Http404()

    @action(methods=['get'], detail=True)
    def download(self, request, pk=None):
        try:
            return self.file_response(
                pk, request, "attachment")
        except (FileNotFoundError, Document.DoesNotExist):
            raise Http404()
class LogViewSet(ViewSet):
    """Read-only access to the application's log files."""

    permission_classes = (IsAuthenticated,)

    # Only these log names may be requested.
    log_files = ["paperless", "mail"]

    def retrieve(self, request, pk=None, *args, **kwargs):
        # Reject anything that is not a whitelisted log name, and 404 when
        # the corresponding file is missing on disk.
        if pk not in self.log_files:
            raise Http404()

        filename = os.path.join(settings.LOGGING_DIR, f"{pk}.log")

        if not os.path.isfile(filename):
            raise Http404()

        with open(filename, "r") as f:
            lines = [line.rstrip() for line in f]

        return Response(lines)

    def list(self, request, *args, **kwargs):
        return Response(self.log_files)
class SavedViewViewSet(ModelViewSet):
    """CRUD endpoint for saved views; each user only sees their own."""

    model = SavedView

    queryset = SavedView.objects.all()
    serializer_class = SavedViewSerializer
    pagination_class = StandardPagination
    permission_classes = (IsAuthenticated,)

    def get_queryset(self):
        # Saved views are private to the user that created them.
        return SavedView.objects.filter(user=self.request.user)

    def perform_create(self, serializer):
        # Record the requesting user as the owner of the new view.
        serializer.save(user=self.request.user)
class BulkEditView(APIView):
    """Applies a bulk edit method to a list of documents."""

    permission_classes = (IsAuthenticated,)
    serializer_class = BulkEditSerializer
    parser_classes = (parsers.JSONParser,)

    def get_serializer_context(self):
        return {
            'request': self.request,
            'format': self.format_kwarg,
            'view': self
        }

    def get_serializer(self, *args, **kwargs):
        kwargs['context'] = self.get_serializer_context()
        return self.serializer_class(*args, **kwargs)

    def post(self, request, *args, **kwargs):
        serializer = self.get_serializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        data = serializer.validated_data
        method = data.get("method")
        parameters = data.get("parameters")
        documents = data.get("documents")

        try:
            # TODO: parameter validation
            result = method(documents, **parameters)
            return Response({"result": result})
        except Exception as e:
            return HttpResponseBadRequest(str(e))
class PostDocumentView(APIView):
    """Accepts an uploaded document and schedules it for consumption."""

    permission_classes = (IsAuthenticated,)
    serializer_class = PostDocumentSerializer
    parser_classes = (parsers.MultiPartParser,)

    def get_serializer_context(self):
        return {
            'request': self.request,
            'format': self.format_kwarg,
            'view': self
        }

    def get_serializer(self, *args, **kwargs):
        kwargs['context'] = self.get_serializer_context()
        return self.serializer_class(*args, **kwargs)

    def post(self, request, *args, **kwargs):
        serializer = self.get_serializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        validated = serializer.validated_data
        doc_name, doc_data = validated.get('document')
        correspondent_id = validated.get('correspondent')
        document_type_id = validated.get('document_type')
        tag_ids = validated.get('tags')
        title = validated.get('title')

        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)

        # Write the upload to a scratch file and stamp its access and
        # modification times with the current time.
        now = int(mktime(datetime.now().timetuple()))
        with tempfile.NamedTemporaryFile(prefix="paperless-upload-",
                                         dir=settings.SCRATCH_DIR,
                                         delete=False) as f:
            f.write(doc_data)
            os.utime(f.name, times=(now, now))
            temp_filename = f.name

        task_id = str(uuid.uuid4())

        # Hand the scratch file off to the consumer task queue.
        async_task("documents.tasks.consume_file",
                   temp_filename,
                   override_filename=doc_name,
                   override_title=title,
                   override_correspondent_id=correspondent_id,
                   override_document_type_id=document_type_id,
                   override_tag_ids=tag_ids,
                   task_id=task_id,
                   task_name=os.path.basename(doc_name)[:100])

        return Response("OK")
class SelectionDataView(APIView):
    """Given a list of document ids, reports how many of those documents
    are associated with each correspondent, tag and document type."""

    permission_classes = (IsAuthenticated,)
    serializer_class = DocumentListSerializer
    parser_classes = (parsers.MultiPartParser, parsers.JSONParser)

    def get_serializer_context(self):
        return {
            'request': self.request,
            'format': self.format_kwarg,
            'view': self
        }

    def get_serializer(self, *args, **kwargs):
        kwargs['context'] = self.get_serializer_context()
        return self.serializer_class(*args, **kwargs)

    @staticmethod
    def _annotate_selection_count(manager, ids):
        # Count, per object, only related documents that are part of the
        # selection. Extracted because the same annotation was previously
        # copy-pasted for correspondents, tags and document types.
        return manager.annotate(document_count=Count(Case(
            When(documents__id__in=ids, then=1),
            output_field=IntegerField()
        )))

    def post(self, request, format=None):
        serializer = self.get_serializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        ids = serializer.validated_data.get('documents')

        correspondents = self._annotate_selection_count(
            Correspondent.objects, ids)
        tags = self._annotate_selection_count(Tag.objects, ids)
        types = self._annotate_selection_count(DocumentType.objects, ids)

        r = Response({
            "selected_correspondents": [{
                "id": t.id,
                "document_count": t.document_count
            } for t in correspondents],
            "selected_tags": [{
                "id": t.id,
                "document_count": t.document_count
            } for t in tags],
            "selected_document_types": [{
                "id": t.id,
                "document_count": t.document_count
            } for t in types]
        })

        return r
class SearchView(APIView):
    """Full-text search endpoint backed by the search index."""

    permission_classes = (IsAuthenticated,)

    def add_infos_to_hit(self, r):
        # Combine an index hit with the corresponding database document;
        # the document may be gone if the index is stale.
        try:
            doc = Document.objects.get(id=r['id'])
        except Document.DoesNotExist:
            logger.warning(
                f"Search index returned a non-existing document: "
                f"id: {r['id']}, title: {r['title']}. "
                f"Search index needs reindex."
            )
            doc = None

        return {
            'id': r['id'],
            'highlights': r.highlights("content", text=doc.content) if doc else None,  # NOQA: E501
            'score': r.score,
            'rank': r.rank,
            'document': DocumentSerializer(doc).data if doc else None,
            'title': r['title'],
        }

    def get(self, request, format=None):
        from documents import index

        params = request.query_params

        query = params['query'] if 'query' in params else None

        if 'more_like' in params:
            more_like_id = params['more_like']
            more_like_content = Document.objects.get(id=more_like_id).content
        else:
            more_like_id = None
            more_like_content = None

        # Nothing to search for: report an empty result set.
        if not query and not more_like_id:
            return Response({
                'count': 0,
                'page': 0,
                'page_count': 0,
                'corrected_query': None,
                'results': [],
            })

        try:
            page = int(params.get('page', 1))
        except (ValueError, TypeError):
            page = 1
        page = max(page, 1)

        ix = index.open_index()

        try:
            with index.query_page(ix, page, query, more_like_id, more_like_content) as (result_page, corrected_query):  # NOQA: E501
                return Response({
                    'count': len(result_page),
                    'page': result_page.pagenum,
                    'page_count': result_page.pagecount,
                    'corrected_query': corrected_query,
                    'results': list(map(self.add_infos_to_hit, result_page)),
                })
        except Exception as e:
            return HttpResponseBadRequest(str(e))
class SearchAutoCompleteView(APIView):
    """Provides search term autocompletion from the search index.

    Query parameters: ``term`` (required), ``limit`` (optional positive
    integer, default 10).
    """

    permission_classes = (IsAuthenticated,)

    def get(self, request, format=None):
        if 'term' in request.query_params:
            term = request.query_params['term']
        else:
            return HttpResponseBadRequest("Term required")

        if 'limit' in request.query_params:
            # Fix: a non-numeric limit used to raise an unhandled
            # ValueError (HTTP 500); treat it as a bad request instead.
            try:
                limit = int(request.query_params['limit'])
            except ValueError:
                return HttpResponseBadRequest("Invalid limit")
            if limit <= 0:
                return HttpResponseBadRequest("Invalid limit")
        else:
            limit = 10

        from documents import index
        ix = index.open_index()

        return Response(index.autocomplete(ix, term, limit))
class StatisticsView(APIView):
    """Returns the total document count and the inbox document count."""

    permission_classes = (IsAuthenticated,)

    def get(self, request, format=None):
        documents_total = Document.objects.all().count()

        if Tag.objects.filter(is_inbox_tag=True).exists():
            documents_inbox = Document.objects.filter(
                tags__is_inbox_tag=True).distinct().count()
        else:
            # No inbox tag configured, so there is no inbox count.
            documents_inbox = None

        return Response({
            'documents_total': documents_total,
            'documents_inbox': documents_inbox,
        })
class BulkDownloadView(APIView):
    """Builds a zip file of the requested documents in the scratch
    directory and serves it as a single download."""

    permission_classes = (IsAuthenticated,)
    serializer_class = BulkDownloadSerializer
    parser_classes = (parsers.JSONParser,)

    def get_serializer_context(self):
        return {
            'request': self.request,
            'format': self.format_kwarg,
            'view': self
        }

    def get_serializer(self, *args, **kwargs):
        kwargs['context'] = self.get_serializer_context()
        return self.serializer_class(*args, **kwargs)

    def post(self, request, format=None):
        serializer = self.get_serializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        ids = serializer.validated_data.get('documents')
        compression = serializer.validated_data.get('compression')
        content = serializer.validated_data.get('content')

        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)

        temp = tempfile.NamedTemporaryFile(
            dir=settings.SCRATCH_DIR,
            suffix="-compressed-archive",
            delete=False)
        # Fix: the temp file handle was never closed and the file itself
        # was never deleted, leaking one scratch file per bulk download.
        # We only use the name; the zip is written through zipfile.
        temp.close()

        if content == 'both':
            strategy_class = OriginalAndArchiveStrategy
        elif content == 'originals':
            strategy_class = OriginalsOnlyStrategy
        else:
            strategy_class = ArchiveOnlyStrategy

        try:
            with zipfile.ZipFile(temp.name, "w", compression) as zipf:
                strategy = strategy_class(zipf)
                for document_id in ids:
                    doc = Document.objects.get(id=document_id)
                    strategy.add_document(doc)

            with open(temp.name, "rb") as f:
                # HttpResponse reads the file content eagerly (unlike
                # StreamingHttpResponse), so the scratch file can be
                # removed as soon as the response has been built.
                response = HttpResponse(f, content_type="application/zip")
                response["Content-Disposition"] = '{}; filename="{}"'.format(
                    "attachment", "documents.zip")
                return response
        finally:
            os.unlink(temp.name)