2020-11-03 12:23:24 +01:00
|
|
|
import logging
|
|
|
|
|
|
2020-10-25 23:03:02 +01:00
|
|
|
from django.db import models
|
|
|
|
|
from django.dispatch import receiver
|
2020-11-02 12:23:50 +01:00
|
|
|
from whoosh.fields import Schema, TEXT, NUMERIC
|
2020-10-25 23:03:02 +01:00
|
|
|
from whoosh.highlight import Formatter, get_text
|
|
|
|
|
from whoosh.index import create_in, exists_in, open_dir
|
|
|
|
|
from whoosh.writing import AsyncWriter
|
|
|
|
|
|
|
|
|
|
from documents.models import Document
|
|
|
|
|
from paperless import settings
|
|
|
|
|
|
|
|
|
|
|
2020-11-08 11:24:57 +01:00
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
2020-10-25 23:03:02 +01:00
|
|
|
class JsonFormatter(Formatter):
    """Highlight formatter that emits plain Python structures, not markup.

    Every fragment is rendered as a list of dicts. Text between matches
    becomes ``{'text': ...}``; a matched token becomes
    ``{'text': ..., 'term': n}`` where ``n`` is a number handed out per
    distinct token text, in order of first appearance.
    """

    def __init__(self):
        # token text -> term number. Kept on the instance, so numbering is
        # shared by every fragment this formatter processes.
        self.seen = {}

    def format_token(self, text, token, replace=False):
        """Return the dict describing a single matched token.

        :param text: the text the token was found in.
        :param token: the matched token.
        :param replace: forwarded unchanged to ``get_text``.
        :return: ``{'text': <token text>, 'term': <term number>}``.
        """
        token_text = self._text(get_text(text, token, replace))
        # setdefault returns the number already assigned to this text, or
        # registers the next free one (the current size of the mapping).
        term_number = self.seen.setdefault(token_text, len(self.seen))
        return {'text': token_text, 'term': term_number}

    def format_fragment(self, fragment, replace=False):
        """Split one fragment into a list of plain-text and token dicts.

        :param fragment: object exposing ``text``, ``startchar``,
            ``endchar`` and ``matches``.
        :param replace: forwarded unchanged to ``format_token``.
        :return: list of dicts covering the fragment from ``startchar``
            up to ``endchar``.
        """
        source = fragment.text
        cursor = fragment.startchar
        pieces = []

        for match in fragment.matches:
            start = match.startchar
            # Ignore tokens without a position and tokens that begin before
            # the point already emitted.
            if start is None or start < cursor:
                continue
            if start > cursor:
                # Unmatched text sitting between the cursor and this token.
                pieces.append({'text': source[cursor:start]})
            pieces.append(self.format_token(source, match, replace))
            cursor = match.endchar

        # Whatever remains after the last emitted token.
        if cursor < fragment.endchar:
            pieces.append({'text': source[cursor:fragment.endchar]})
        return pieces

    def format(self, fragments, replace=False):
        """Format every fragment; returns one list per fragment, in order."""
        return [
            self.format_fragment(fragment, replace=replace)
            for fragment in fragments
        ]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_schema():
    """Build the Whoosh schema used for the document index.

    Fields:
      * ``id`` - stored, unique integer.
      * ``title`` - stored text.
      * ``content`` - text field created with default options.
      * ``correspondent`` - stored text.
    """
    fields = {
        'id': NUMERIC(stored=True, unique=True, numtype=int),
        'title': TEXT(stored=True),
        'content': TEXT(),
        'correspondent': TEXT(stored=True),
    }
    return Schema(**fields)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def open_index(recreate=False):
    """Open the search index in ``settings.INDEX_DIR``, creating it if needed.

    :param recreate: when true, a new index is created with the schema
        from ``get_schema()`` even if one already exists in the directory.
    :return: the index returned by ``open_dir`` (existing index) or
        ``create_in`` (new index).
    """
    # Function-local import so this block does not depend on the module's
    # import section being edited.
    import os

    index_dir = settings.INDEX_DIR

    if exists_in(index_dir) and not recreate:
        return open_dir(index_dir)

    # create_in() writes its files into the given directory; make sure the
    # directory is there first so a fresh install (or a removed index
    # directory) does not fail before the index can be built.
    os.makedirs(index_dir, exist_ok=True)
    return create_in(index_dir, get_schema())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def update_document(writer, doc):
    """Add ``doc`` to the index through ``writer``, or refresh its entry.

    :param writer: object exposing ``update_document(**fields)``.
    :param doc: object exposing ``pk``, ``title``, ``content`` and
        ``correspondent`` (which may be falsy).
    """
    logger.debug("Indexing {}...".format(doc))

    # The correspondent is optional; index its name only when one is set.
    correspondent_name = None
    if doc.correspondent:
        correspondent_name = doc.correspondent.name

    writer.update_document(
        id=doc.pk,
        title=doc.title,
        content=doc.content,
        correspondent=correspondent_name,
    )
|
|
|
|
|
|
2020-10-27 17:07:13 +01:00
|
|
|
|
2020-11-08 11:24:57 +01:00
|
|
|
def remove_document(writer, doc):
    """Delete the index entry whose ``id`` term equals ``doc.pk``.

    :param writer: object exposing ``delete_by_term(fieldname, value)``.
    :param doc: object exposing ``pk``.
    """
    message = "Removing {} from index...".format(doc)
    logger.debug(message)

    document_id = doc.pk
    writer.delete_by_term('id', document_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def add_or_update_document(document):
    """Open the index and write ``document`` into it.

    The write goes through an ``AsyncWriter`` used as a context manager
    around a single ``update_document`` call.
    """
    with AsyncWriter(open_index()) as index_writer:
        update_document(index_writer, document)
|
2020-10-25 23:03:02 +01:00
|
|
|
|
|
|
|
|
|
2020-11-08 11:24:57 +01:00
|
|
|
def remove_document_from_index(document):
    """Open the index and delete ``document`` from it.

    The deletion goes through an ``AsyncWriter`` used as a context manager
    around a single ``remove_document`` call.
    """
    with AsyncWriter(open_index()) as index_writer:
        remove_document(index_writer, document)
|
2020-10-25 23:03:02 +01:00
|
|
|
|
|
|
|
|
|
2020-10-27 17:07:13 +01:00
|
|
|
def autocomplete(ix, term, limit=10):
    """Suggest terms from the ``content`` field that start with ``term``.

    :param ix: index object exposing ``reader()``, usable as a context
        manager.
    :param term: prefix to complete; lower-cased before the lookup.
    :param limit: maximum number of terms requested from the reader.
    :return: list of terms in the order the reader yields them, with the
        scores dropped.
    """
    prefix = term.lower()
    with ix.reader() as index_reader:
        scored_terms = index_reader.most_distinctive_terms(
            "content", limit, prefix
        )
        # Each item is a (score, term) pair; only the term is kept.
        return [candidate for (_score, candidate) in scored_terms]
|