paperless-ngx/src/documents/index.py

327 lines
10 KiB
Python
Raw Normal View History

import logging
import math
import os
2020-11-10 01:47:35 +01:00
from contextlib import contextmanager
from dateutil.parser import isoparse
from django.conf import settings
2023-01-04 19:06:06 -08:00
from documents.models import Comment
from documents.models import Document
from guardian.shortcuts import get_users_with_perms
from whoosh import classify
from whoosh import highlight
from whoosh import query
from whoosh.fields import BOOLEAN
from whoosh.fields import DATETIME
from whoosh.fields import KEYWORD
from whoosh.fields import NUMERIC
from whoosh.fields import Schema
from whoosh.fields import TEXT
2021-05-15 13:58:11 +02:00
from whoosh.highlight import HtmlFormatter
from whoosh.index import create_in
from whoosh.index import exists_in
from whoosh.index import open_dir
2020-11-08 11:30:16 +01:00
from whoosh.qparser import MultifieldParser
from whoosh.qparser.dateparse import DateParserPlugin
from whoosh.searching import ResultsPage
from whoosh.searching import Searcher
from whoosh.writing import AsyncWriter
2021-02-05 01:10:29 +01:00
# Module-level logger; records from this module are emitted under the
# "paperless.index" logger name.
logger = logging.getLogger("paperless.index")
def get_schema():
    """Build the Whoosh schema used for the document index.

    A fresh ``Schema`` instance is constructed on every call.  Most related
    objects (correspondent, tag, type, path, owner) are indexed three ways:
    by name, by numeric/keyword id, and as a ``has_*`` boolean flag.

    Returns:
        A ``whoosh.fields.Schema`` with one field per indexed attribute.
    """
    fields = {
        # Identity and free text.
        "id": NUMERIC(stored=True, unique=True),
        "title": TEXT(sortable=True),
        "content": TEXT(),
        "asn": NUMERIC(sortable=True),
        # Correspondent.
        "correspondent": TEXT(sortable=True),
        "correspondent_id": NUMERIC(),
        "has_correspondent": BOOLEAN(),
        # Tags: comma-separated names and ids.
        "tag": KEYWORD(commas=True, scorable=True, lowercase=True),
        "tag_id": KEYWORD(commas=True, scorable=True),
        "has_tag": BOOLEAN(),
        # Document type.
        "type": TEXT(sortable=True),
        "type_id": NUMERIC(),
        "has_type": BOOLEAN(),
        # Timestamps.
        "created": DATETIME(sortable=True),
        "modified": DATETIME(sortable=True),
        "added": DATETIME(sortable=True),
        # Storage path.
        "path": TEXT(sortable=True),
        "path_id": NUMERIC(),
        "has_path": BOOLEAN(),
        # Comments, ownership and view permissions.
        "comments": TEXT(),
        "owner": TEXT(),
        "owner_id": NUMERIC(),
        "has_owner": BOOLEAN(),
        "viewer_id": KEYWORD(commas=True),
    }
    return Schema(**fields)
def open_index(recreate=False):
    """Open the index stored in ``settings.INDEX_DIR``, creating it if needed.

    Args:
        recreate: When true, an existing index is not opened; a new one is
            created in its place.

    Returns:
        The opened index, or a newly created one when no index exists,
        ``recreate`` is true, or opening the existing index raised.
    """
    try:
        index_present = exists_in(settings.INDEX_DIR)
        if index_present and not recreate:
            return open_dir(settings.INDEX_DIR, schema=get_schema())
    except Exception:
        # Best effort: a broken index is logged and replaced below.
        logger.exception("Error while opening the index, recreating.")

    index_dir = settings.INDEX_DIR
    if not os.path.isdir(index_dir):
        os.makedirs(index_dir, exist_ok=True)
    return create_in(index_dir, get_schema())
2021-02-15 13:26:36 +01:00
@contextmanager
def open_index_writer(optimize=False):
    """Yield an ``AsyncWriter`` on the index and finalise it on exit.

    On a clean exit the pending changes are committed.  If the managed block
    raises an ``Exception``, it is logged, the pending changes are cancelled
    and the exception is swallowed (best-effort indexing).  Any other
    ``BaseException`` (e.g. ``KeyboardInterrupt``) cancels the writer and is
    re-raised.

    Args:
        optimize: Forwarded to ``writer.commit(optimize=...)``.

    Yields:
        The ``AsyncWriter`` wrapping the index returned by ``open_index()``.
    """
    writer = AsyncWriter(open_index())

    try:
        yield writer
    except Exception as e:
        logger.exception(str(e))
        writer.cancel()
    except BaseException:
        # Do not leave the writer (and its index lock) dangling when the
        # block is interrupted; propagate the interruption unchanged.
        writer.cancel()
        raise
    else:
        # Commit only on success: cancel() has already finalised the writer
        # on the error paths, so committing there would either fail on a
        # closed writer or flush a half-finished batch.
        writer.commit(optimize=optimize)
@contextmanager
def open_index_searcher():
    """Yield a searcher on the index and close it when the block exits.

    The searcher is closed whether the managed block finishes normally or
    raises.
    """
    index = open_index()
    index_searcher = index.searcher()
    try:
        yield index_searcher
    finally:
        index_searcher.close()
def update_document(writer, doc):
    """Write the index entry for ``doc`` through ``writer.update_document``.

    Multi-valued data (tag names, tag ids, comments, ids of users holding the
    ``view_document`` permission) is flattened into comma-separated strings.
    Optional relations are indexed as ``None`` together with a ``has_*`` flag
    set to ``False``.

    Args:
        writer: Index writer exposing ``update_document(**fields)``.
        doc: The document model instance to index.
    """
    # Materialise the tags once so the name list and the id list are built
    # from the same rows and the relation is only evaluated a single time.
    doc_tags = list(doc.tags.all())
    tags = ",".join([t.name for t in doc_tags])
    tags_ids = ",".join([str(t.id) for t in doc_tags])

    comments = ",".join(
        [str(c.comment) for c in Comment.objects.filter(document=doc)],
    )

    users_with_perms = get_users_with_perms(
        doc,
        only_with_perms_in=["view_document"],
    )
    viewer_ids = ",".join([str(u.id) for u in users_with_perms])

    correspondent = doc.correspondent
    document_type = doc.document_type
    storage_path = doc.storage_path
    owner = doc.owner

    writer.update_document(
        id=doc.pk,
        title=doc.title,
        content=doc.content,
        correspondent=correspondent.name if correspondent else None,
        correspondent_id=correspondent.id if correspondent else None,
        has_correspondent=correspondent is not None,
        # Empty strings are indexed as None rather than as an empty keyword.
        tag=tags if tags else None,
        tag_id=tags_ids if tags_ids else None,
        has_tag=len(tags) > 0,
        type=document_type.name if document_type else None,
        type_id=document_type.id if document_type else None,
        has_type=document_type is not None,
        created=doc.created,
        added=doc.added,
        asn=doc.archive_serial_number,
        modified=doc.modified,
        path=storage_path.name if storage_path else None,
        path_id=storage_path.id if storage_path else None,
        has_path=storage_path is not None,
        comments=comments,
        owner=owner.username if owner else None,
        owner_id=owner.id if owner else None,
        has_owner=owner is not None,
        viewer_id=viewer_ids if viewer_ids else None,
    )
2020-10-27 17:07:13 +01:00
def remove_document(writer, doc):
    """Delete the index entry of ``doc`` using its primary key.

    Args:
        writer: Index writer the deletion is issued on.
        doc: Document whose ``pk`` identifies the entry to remove.
    """
    document_id = doc.pk
    remove_document_by_id(writer, document_id)
def remove_document_by_id(writer, doc_id):
    """Delete every index entry whose ``id`` term equals ``doc_id``.

    Args:
        writer: Index writer exposing ``delete_by_term``.
        doc_id: Value of the ``id`` field to delete.
    """
    id_field = "id"
    writer.delete_by_term(id_field, doc_id)
def add_or_update_document(document):
    """Index ``document`` using a writer from ``open_index_writer()``.

    Args:
        document: The document to pass to ``update_document``.
    """
    with open_index_writer() as index_writer:
        update_document(index_writer, document)
def remove_document_from_index(document):
    """Remove ``document`` using a writer from ``open_index_writer()``.

    Args:
        document: The document to pass to ``remove_document``.
    """
    with open_index_writer() as index_writer:
        remove_document(index_writer, document)
class DelayedQuery:
    """Sliceable, page-cached wrapper around an index search.

    Subclasses provide the query through ``_get_query``.  This base class
    turns ``query_params`` into an index filter and a sort order, runs the
    search one page at a time when the object is sliced, and remembers each
    page under the slice's start offset.
    """

    def _get_query(self):
        """Return a ``(query, mask)`` pair; subclasses must override this."""
        raise NotImplementedError()

    def _get_query_filter(self):
        """Translate ``query_params`` into a filter query.

        Recognised parameters each contribute one or more clauses that are
        AND-ed together.  An access clause is always included: it matches
        entries with ``has_owner`` false and, when a ``"user"`` parameter is
        present, entries whose ``owner_id`` or ``viewer_id`` equals it.
        """
        # request parameter -> numeric id field in the index
        id_fields = {
            "correspondent__id": "correspondent_id",
            "document_type__id": "type_id",
            "storage_path__id": "path_id",
        }
        # request parameter -> (boolean index field, literal meaning True)
        flag_fields = {
            "correspondent__isnull": ("has_correspondent", "false"),
            "is_tagged": ("has_tag", "true"),
            "document_type__isnull": ("has_type", "false"),
            "storage_path__isnull": ("has_path", "false"),
        }
        # request parameter -> (datetime index field, which bound is set)
        date_bounds = {
            "created__date__lt": ("created", "end"),
            "created__date__gt": ("created", "start"),
            "added__date__gt": ("added", "start"),
            "added__date__lt": ("added", "end"),
        }

        clauses = []
        for key, value in self.query_params.items():
            if key in id_fields:
                clauses.append(query.Term(id_fields[key], value))
            elif key == "tags__id__all":
                clauses.extend(
                    [query.Term("tag_id", tag_id) for tag_id in value.split(",")],
                )
            elif key == "tags__id__none":
                clauses.extend(
                    [
                        query.Not(query.Term("tag_id", tag_id))
                        for tag_id in value.split(",")
                    ],
                )
            elif key in flag_fields:
                fieldname, truthy_literal = flag_fields[key]
                clauses.append(query.Term(fieldname, value == truthy_literal))
            elif key in date_bounds:
                fieldname, bound = date_bounds[key]
                limits = {"start": None, "end": None}
                limits[bound] = isoparse(value)
                clauses.append(query.DateRange(fieldname, **limits))

        access_clauses = [query.Term("has_owner", False)]
        if "user" in self.query_params:
            user = self.query_params["user"]
            access_clauses.append(query.Term("owner_id", user))
            # viewer_id is a keyword field, hence the string form.
            access_clauses.append(query.Term("viewer_id", str(user)))

        if not clauses:
            return query.Or(access_clauses)
        clauses.append(query.Or(access_clauses))
        return query.And(clauses)

    def _get_query_sortedby(self):
        """Map the ``"ordering"`` parameter to ``(index_field, reverse)``.

        A leading ``-`` requests reverse order.  ``(None, False)`` is
        returned when no ordering is given or the field is not sortable.
        """
        if "ordering" not in self.query_params:
            return None, False

        ordering: str = self.query_params["ordering"]
        descending = ordering.startswith("-")
        requested = ordering[1:] if descending else ordering

        index_field = {
            "created": "created",
            "modified": "modified",
            "added": "added",
            "title": "title",
            "correspondent__name": "correspondent",
            "document_type__name": "type",
            "archive_serial_number": "asn",
        }.get(requested)

        if index_field is None:
            return None, False
        return index_field, descending

    def __init__(self, searcher: Searcher, query_params, page_size):
        """Store the searcher, request parameters and page size."""
        self.searcher = searcher
        self.query_params = query_params
        self.page_size = page_size
        # Pages already fetched, keyed by the slice start that produced them.
        self.saved_results = dict()
        # Score of the first hit of the first unsorted page; used to
        # normalise scores.
        self.first_score = None

    def __len__(self):
        """Return the length reported by the first results page."""
        return len(self[0:1])

    def __getitem__(self, item):
        """Return the results page containing ``item.start``.

        ``item`` is expected to be a slice; only its ``start`` is used.  The
        page is fetched once and cached.  Hit scores are divided by
        ``first_score`` when it is set, otherwise replaced by ``None``.
        """
        try:
            return self.saved_results[item.start]
        except KeyError:
            pass

        q, mask = self._get_query()
        sortedby, reverse = self._get_query_sortedby()
        index_filter = self._get_query_filter()
        page_number = math.floor(item.start / self.page_size) + 1

        page: ResultsPage = self.searcher.search_page(
            q,
            mask=mask,
            filter=index_filter,
            pagenum=page_number,
            pagelen=self.page_size,
            sortedby=sortedby,
            reverse=reverse,
        )
        results = page.results
        results.fragmenter = highlight.ContextFragmenter(surround=50)
        results.formatter = HtmlFormatter(tagname="span", between=" ... ")

        # Remember the top score once, and only for relevance-ranked results.
        if not self.first_score and len(results) > 0 and sortedby is None:
            self.first_score = results[0].score

        top_score = self.first_score
        results.top_n = [
            ((hit[0] / top_score) if top_score else None, hit[1])
            for hit in results.top_n
        ]

        self.saved_results[item.start] = page
        return page
class DelayedFullTextQuery(DelayedQuery):
    """Delayed query driven by the free-text ``"query"`` parameter."""

    def _get_query(self):
        """Parse the ``"query"`` parameter into a multi-field query.

        Returns:
            ``(parsed_query, None)``; no mask is used for full-text search.
        """
        user_query = self.query_params["query"]
        search_fields = ["content", "title", "correspondent", "tag", "type", "comments"]

        parser = MultifieldParser(search_fields, self.searcher.ixreader.schema)
        parser.add_plugin(DateParserPlugin())
        parsed = parser.parse(user_query)

        # NOTE(review): the correction is computed and adjusted here but is
        # neither returned nor stored, so it does not affect the result.
        correction = self.searcher.correct_query(parsed, user_query)
        if correction.query != parsed:
            correction.query = correction.string

        return parsed, None
class DelayedMoreLikeThisQuery(DelayedQuery):
    """Delayed query finding entries similar to a reference document."""

    def _get_query(self):
        """Build a query from the key terms of the reference document.

        The document named by the ``"more_like_id"`` parameter is loaded
        from the database, up to 20 key terms are extracted from its
        content, and each becomes a boosted ``content`` term.

        Returns:
            ``(query, mask)`` where ``mask`` is a set holding the reference
            document's index number (presumably so it is left out of its own
            results; confirm against Whoosh's ``mask`` semantics).
        """
        reference_id = int(self.query_params["more_like_id"])
        reference_text = Document.objects.get(id=reference_id).content
        reference_docnum = self.searcher.document_number(id=reference_id)

        key_terms = self.searcher.key_terms_from_text(
            "content",
            reference_text,
            numterms=20,
            model=classify.Bo1Model,
            normalize=False,
        )

        term_queries = []
        for word, weight in key_terms:
            term_queries.append(query.Term("content", word, boost=weight))

        return query.Or(term_queries), {reference_docnum}
2020-10-27 17:07:13 +01:00
def autocomplete(ix, term, limit=10):
    """Return completion candidates for ``term`` from the ``content`` field.

    Args:
        ix: Index object whose ``reader()`` is usable as a context manager.
        term: Prefix to complete; it is lower-cased before the lookup.
        limit: Maximum number of terms requested from the reader.

    Returns:
        A list of the terms yielded by ``most_distinctive_terms``, in the
        order the reader produced them (scores are discarded).
    """
    prefix = term.lower()
    with ix.reader() as reader:
        return [
            candidate
            for _score, candidate in reader.most_distinctive_terms(
                "content",
                number=limit,
                prefix=prefix,
            )
        ]