# paperless-ngx/src/documents/index.py

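"""Whoosh-backed full-text search index for paperless documents.

Defines the index schema, opens or (re)creates the on-disk index, keeps
it in sync with document changes, and exposes paginated search with
JSON highlighting, spelling correction, and prefix autocompletion.
"""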

import logging
import os
from contextlib import contextmanager
from django.conf import settings
from whoosh import highlight
from whoosh.fields import Schema, TEXT, NUMERIC, KEYWORD
from whoosh.highlight import Formatter, get_text
from whoosh.index import create_in, exists_in, open_dir
from whoosh.qparser import MultifieldParser
from whoosh.writing import AsyncWriter

logger = logging.getLogger(__name__)


class JsonFormatter(Formatter):
    """Whoosh highlight formatter that emits JSON-serializable fragments
    instead of marked-up text."""

    def __init__(self):
        self.seen = {}

    def format_token(self, text, token, replace=False):
        # Assign each distinct matched term a stable number so repeated
        # occurrences of the same term can be grouped by the consumer.
        seen = self.seen
        ttext = self._text(get_text(text, token, replace))
        if ttext in seen:
            termnum = seen[ttext]
        else:
            termnum = len(seen)
            seen[ttext] = termnum

        return {'text': ttext, 'term': termnum}

    def format_fragment(self, fragment, replace=False):
        output = []
        index = fragment.startchar
        text = fragment.text

        for t in fragment.matches:
            if t.startchar is None:
                continue
            if t.startchar < index:
                continue
            if t.startchar > index:
                # Plain text between the previous match and this one.
                output.append({'text': text[index:t.startchar]})
            output.append(self.format_token(text, t, replace))
            index = t.endchar
        if index < fragment.endchar:
            # Trailing text after the last match.
            output.append({'text': text[index:fragment.endchar]})

        return output

    def format(self, fragments, replace=False):
        output = []
        for fragment in fragments:
            output.append(self.format_fragment(fragment, replace=replace))
        return output
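
# Example (sketch) of the structure format_fragment() produces for a
# fragment matching "invoice"; the surrounding text is hypothetical:
#
#   [{'text': 'your '}, {'text': 'invoice', 'term': 0},
#    {'text': ' is attached'}]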


def get_schema():
    # Fields with stored=True are returned with search results; "content"
    # is searchable only, which keeps the index small.
    return Schema(
        id=NUMERIC(stored=True, unique=True, numtype=int),
        title=TEXT(stored=True),
        content=TEXT(),
        correspondent=TEXT(stored=True),
        tag=KEYWORD(stored=True, commas=True, scorable=True, lowercase=True)
    )


def open_index(recreate=False):
    try:
        if exists_in(settings.INDEX_DIR) and not recreate:
            return open_dir(settings.INDEX_DIR, schema=get_schema())
    except Exception as e:
        logger.error(f"Error while opening the index: {e}, recreating.")

    os.makedirs(settings.INDEX_DIR, exist_ok=True)
    return create_in(settings.INDEX_DIR, get_schema())
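
# Example (sketch): rebuild the index from scratch. Assumes a configured
# Django environment; "Document" stands in for the paperless document model.
#
#   from documents.models import Document
#   ix = open_index(recreate=True)
#   with AsyncWriter(ix) as writer:
#       for doc in Document.objects.all():
#           update_document(writer, doc)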


def update_document(writer, doc):
    logger.debug(f"Indexing {doc}...")
    # Tag names are flattened into one comma-separated KEYWORD value.
    tags = ",".join([t.name for t in doc.tags.all()])
    writer.update_document(
        id=doc.pk,
        title=doc.title,
        content=doc.content,
        correspondent=doc.correspondent.name if doc.correspondent else None,
        tag=tags if tags else None
    )


def remove_document(writer, doc):
    logger.debug(f"Removing {doc} from index...")
    writer.delete_by_term('id', doc.pk)


def add_or_update_document(document):
    ix = open_index()
    with AsyncWriter(ix) as writer:
        update_document(writer, document)


def remove_document_from_index(document):
    ix = open_index()
    with AsyncWriter(ix) as writer:
        remove_document(writer, document)
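
# Example (sketch): keeping the index current via Django model signals.
# The receiver below is a hypothetical illustration, not code from this
# module.
#
#   from django.db.models.signals import post_save
#   from django.dispatch import receiver
#   from documents.models import Document
#
#   @receiver(post_save, sender=Document)
#   def update_index(sender, instance, **kwargs):
#       add_or_update_document(instance)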


@contextmanager
def query_page(ix, query, page):
    searcher = ix.searcher()
    try:
        parsed_query = MultifieldParser(
            ["content", "title", "correspondent", "tag"],
            ix.schema).parse(query)

        result_page = searcher.search_page(parsed_query, page)
        result_page.results.fragmenter = highlight.ContextFragmenter(
            surround=50)
        result_page.results.formatter = JsonFormatter()

        # Offer a spelling-corrected query only if Whoosh suggests one
        # that differs from what the user typed.
        corrected = searcher.correct_query(parsed_query, query)
        if corrected.query != parsed_query:
            corrected_query = corrected.string
        else:
            corrected_query = None

        yield result_page, corrected_query
    finally:
        searcher.close()
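
# Example (sketch): fetch one page of results with a spelling suggestion.
# Since "content" is not stored in the index, highlighting needs the raw
# text, e.g. reloaded from the database (hypothetical usage):
#
#   ix = open_index()
#   with query_page(ix, "invoice", page=1) as (results, corrected):
#       for hit in results:
#           doc = Document.objects.get(pk=hit["id"])
#           print(hit["title"], hit.highlights("content", text=doc.content))
#       if corrected:
#           print("Did you mean:", corrected)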


def autocomplete(ix, term, limit=10):
    # Suggest completions by picking the most distinctive indexed terms
    # in "content" that start with the given prefix.
    with ix.reader() as reader:
        terms = []
        for (score, t) in reader.most_distinctive_terms(
                "content", number=limit, prefix=term.lower()):
            terms.append(t)
        return terms
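
# Example (sketch): term completions for a typed prefix. Whoosh returns
# indexed terms as bytes; the values shown are hypothetical:
#
#   ix = open_index()
#   print(autocomplete(ix, "inv"))  # e.g. [b'invoice', b'invoices']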