# paperless-ngx/src/documents/index.py


import logging
import os
from contextlib import contextmanager

from django.conf import settings
from whoosh import highlight, classify, query
from whoosh.fields import Schema, TEXT, NUMERIC, KEYWORD, DATETIME
from whoosh.highlight import Formatter, get_text
from whoosh.index import create_in, exists_in, open_dir
from whoosh.qparser import MultifieldParser
from whoosh.qparser.dateparse import DateParserPlugin
from whoosh.writing import AsyncWriter

logger = logging.getLogger("paperless.index")


class JsonFormatter(Formatter):
    def __init__(self):
        self.seen = {}

    def format_token(self, text, token, replace=False):
        ttext = self._text(get_text(text, token, replace))
        # A real boolean, to match the False used for unhighlighted runs
        # below; the string 'true' would serialize inconsistently as JSON.
        return {'text': ttext, 'highlight': True}

    def format_fragment(self, fragment, replace=False):
        output = []
        index = fragment.startchar
        text = fragment.text
        amend_token = None
        for t in fragment.matches:
            if t.startchar is None:
                continue
            if t.startchar < index:
                continue
            if t.startchar > index:
                text_inbetween = text[index:t.startchar]
                if amend_token and t.startchar - index < 10:
                    # Short gaps (<10 chars) between matches are folded into
                    # the previous highlighted token instead of emitting a
                    # separate plain-text run.
                    amend_token['text'] += text_inbetween
                else:
                    output.append({'text': text_inbetween,
                                   'highlight': False})
                    amend_token = None
            token = self.format_token(text, t, replace)
            if amend_token:
                amend_token['text'] += token['text']
            else:
                output.append(token)
                amend_token = token
            index = t.endchar
        if index < fragment.endchar:
            output.append({'text': text[index:fragment.endchar],
                           'highlight': False})
        return output

    def format(self, fragments, replace=False):
        output = []
        for fragment in fragments:
            output.append(self.format_fragment(fragment, replace=replace))
        return output
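
# Illustration (not part of the original module): for a query hit on
# "invoice", format_fragment() yields a JSON-serializable run list such as
#     [{'text': 'your ', 'highlight': False},
#      {'text': 'invoice', 'highlight': True},
#      {'text': ' arrived', 'highlight': False}]
# which lets the API return structured highlights instead of HTML markup.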


def get_schema():
    return Schema(
        id=NUMERIC(stored=True, unique=True, numtype=int),
        title=TEXT(stored=True),
        content=TEXT(),
        correspondent=TEXT(stored=True),
        tag=KEYWORD(stored=True, commas=True, scorable=True, lowercase=True),
        type=TEXT(stored=True),
        created=DATETIME(stored=True, sortable=True),
        modified=DATETIME(stored=True, sortable=True),
        added=DATETIME(stored=True, sortable=True),
    )
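
# Note: 'content' is indexed but not stored, so search results cannot carry
# the document body; callers look it up in the database via the stored id.
# 'tag' is a comma-separated KEYWORD field, which is why update_document()
# below joins tag names with ",".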


def open_index(recreate=False):
    try:
        if exists_in(settings.INDEX_DIR) and not recreate:
            return open_dir(settings.INDEX_DIR, schema=get_schema())
    except Exception:
        logger.exception("Error while opening the index, recreating.")

    if not os.path.isdir(settings.INDEX_DIR):
        os.makedirs(settings.INDEX_DIR, exist_ok=True)

    return create_in(settings.INDEX_DIR, get_schema())
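
# A full rebuild can be sketched as follows (hypothetical helper, not part
# of this module; Document is the model that this index mirrors):
#
#     from documents.models import Document
#
#     def rebuild_index():
#         ix = open_index(recreate=True)
#         with AsyncWriter(ix) as writer:
#             for doc in Document.objects.all():
#                 update_document(writer, doc)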


def update_document(writer, doc):
    tags = ",".join([t.name for t in doc.tags.all()])
    writer.update_document(
        id=doc.pk,
        title=doc.title,
        content=doc.content,
        correspondent=doc.correspondent.name if doc.correspondent else None,
        tag=tags if tags else None,
        type=doc.document_type.name if doc.document_type else None,
        created=doc.created,
        added=doc.added,
        modified=doc.modified,
    )
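
# writer.update_document() relies on the unique 'id' field from get_schema():
# whoosh deletes any existing entry with the same id before adding the new
# one, so re-indexing a changed document never produces duplicates.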


def remove_document(writer, doc):
    remove_document_by_id(writer, doc.pk)


def remove_document_by_id(writer, doc_id):
    writer.delete_by_term('id', doc_id)


def add_or_update_document(document):
    ix = open_index()
    with AsyncWriter(ix) as writer:
        update_document(writer, document)


def remove_document_from_index(document):
    ix = open_index()
    with AsyncWriter(ix) as writer:
        remove_document(writer, document)
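
# These two functions are the module's write API. A minimal sketch of how
# they might be wired to a Django signal (handler name and signal are
# assumptions, not part of this module):
#
#     def add_to_index(sender, document, **kwargs):
#         add_or_update_document(document)
#
#     document_consumption_finished.connect(add_to_index)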


@contextmanager
def query_page(ix, page, querystring, more_like_doc_id, more_like_doc_content):
    searcher = ix.searcher()
    try:
        if querystring:
            qp = MultifieldParser(
                ["content", "title", "correspondent", "tag", "type"],
                ix.schema)
            # Understands human-readable dates in queries, e.g. created:2020.
            qp.add_plugin(DateParserPlugin())
            str_q = qp.parse(querystring)
            corrected = searcher.correct_query(str_q, querystring)
        else:
            str_q = None
            corrected = None

        if more_like_doc_id:
            docnum = searcher.document_number(id=more_like_doc_id)
            # "More like this": extract the 20 most characteristic terms of
            # the reference document and search for those, masking the
            # reference document itself from the results.
            kts = searcher.key_terms_from_text(
                'content', more_like_doc_content, numterms=20,
                model=classify.Bo1Model, normalize=False)
            more_like_q = query.Or(
                [query.Term('content', word, boost=weight)
                 for word, weight in kts])
            result_page = searcher.search_page(
                more_like_q, page, filter=str_q, mask={docnum})
        elif str_q:
            result_page = searcher.search_page(str_q, page)
        else:
            raise ValueError(
                "Either querystring or more_like_doc_id is required."
            )

        result_page.results.fragmenter = highlight.ContextFragmenter(
            surround=50)
        result_page.results.formatter = JsonFormatter()

        if corrected and corrected.query != str_q:
            corrected_query = corrected.string
        else:
            corrected_query = None

        yield result_page, corrected_query
    finally:
        searcher.close()
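
# Usage sketch (hypothetical values). query_page() is a context manager so
# that the searcher is closed even if rendering the page fails:
#
#     ix = open_index()
#     with query_page(ix, 1, "invoice created:2020", None, None) as (
#             result_page, corrected_query):
#         for hit in result_page:
#             print(hit['id'], hit['title'])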


def autocomplete(ix, term, limit=10):
    with ix.reader() as reader:
        terms = []
        for (score, t) in reader.most_distinctive_terms(
                "content", number=limit, prefix=term.lower()):
            terms.append(t)
        return terms
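
# Usage sketch (hypothetical input): suggest up to ten indexed content terms
# that start with what the user has typed so far.
#
#     ix = open_index()
#     suggestions = autocomplete(ix, "inv")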