# paperless-ngx/src/documents/index.py


import logging
import os
from contextlib import contextmanager

from django.conf import settings
from whoosh import highlight, classify, query
from whoosh.fields import Schema, TEXT, NUMERIC, KEYWORD, DATETIME
from whoosh.highlight import Formatter, get_text
from whoosh.index import create_in, exists_in, open_dir
from whoosh.qparser import MultifieldParser
from whoosh.qparser.dateparse import DateParserPlugin
from whoosh.writing import AsyncWriter

logger = logging.getLogger("paperless.index")


class JsonFormatter(Formatter):
    def __init__(self):
        self.seen = {}

    def format_token(self, text, token, replace=False):
        ttext = self._text(get_text(text, token, replace))
        # A real boolean, to match the False used for unhighlighted runs
        # below; the string 'true' would serialize inconsistently as JSON.
        return {'text': ttext, 'highlight': True}

    def format_fragment(self, fragment, replace=False):
        output = []
        index = fragment.startchar
        text = fragment.text
        amend_token = None
        for t in fragment.matches:
            if t.startchar is None:
                continue
            if t.startchar < index:
                continue
            if t.startchar > index:
                text_inbetween = text[index:t.startchar]
                if amend_token and t.startchar - index < 10:
                    # Short gaps (<10 chars) between matches are folded into
                    # the previous highlighted token instead of emitting a
                    # separate plain-text run.
                    amend_token['text'] += text_inbetween
                else:
                    output.append({'text': text_inbetween,
                                   'highlight': False})
                    amend_token = None
            token = self.format_token(text, t, replace)
            if amend_token:
                amend_token['text'] += token['text']
            else:
                output.append(token)
                amend_token = token
            index = t.endchar
        if index < fragment.endchar:
            output.append({'text': text[index:fragment.endchar],
                           'highlight': False})
        return output

    def format(self, fragments, replace=False):
        output = []
        for fragment in fragments:
            output.append(self.format_fragment(fragment, replace=replace))
        return output
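
# Illustration (not part of the original module): for a query hit on
# "invoice", format_fragment() yields a JSON-serializable run list such as
#     [{'text': 'your ', 'highlight': False},
#      {'text': 'invoice', 'highlight': True},
#      {'text': ' arrived', 'highlight': False}]
# which lets the API return structured highlights instead of HTML markup.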


def get_schema():
    return Schema(
        id=NUMERIC(stored=True, unique=True, numtype=int),
        title=TEXT(stored=True),
        content=TEXT(),
        correspondent=TEXT(stored=True),
        tag=KEYWORD(stored=True, commas=True, scorable=True, lowercase=True),
        type=TEXT(stored=True),
        created=DATETIME(stored=True, sortable=True),
        modified=DATETIME(stored=True, sortable=True),
        added=DATETIME(stored=True, sortable=True),
    )
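
# Note: 'content' is indexed but not stored, so search results cannot carry
# the document body; callers look it up in the database via the stored id.
# 'tag' is a comma-separated KEYWORD field, which is why update_document()
# below joins tag names with ",".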


def open_index(recreate=False):
    try:
        if exists_in(settings.INDEX_DIR) and not recreate:
            return open_dir(settings.INDEX_DIR, schema=get_schema())
    except Exception:
        logger.exception("Error while opening the index, recreating.")

    if not os.path.isdir(settings.INDEX_DIR):
        os.makedirs(settings.INDEX_DIR, exist_ok=True)

    return create_in(settings.INDEX_DIR, get_schema())
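
# A full rebuild can be sketched as follows (hypothetical helper, not part
# of this module; Document is the model that this index mirrors):
#
#     from documents.models import Document
#
#     def rebuild_index():
#         ix = open_index(recreate=True)
#         with AsyncWriter(ix) as writer:
#             for doc in Document.objects.all():
#                 update_document(writer, doc)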


def update_document(writer, doc):
    tags = ",".join([t.name for t in doc.tags.all()])
    writer.update_document(
        id=doc.pk,
        title=doc.title,
        content=doc.content,
        correspondent=doc.correspondent.name if doc.correspondent else None,
        tag=tags if tags else None,
        type=doc.document_type.name if doc.document_type else None,
        created=doc.created,
        added=doc.added,
        modified=doc.modified,
    )
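
# writer.update_document() relies on the unique 'id' field from get_schema():
# whoosh deletes any existing entry with the same id before adding the new
# one, so re-indexing a changed document never produces duplicates.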


def remove_document(writer, doc):
    remove_document_by_id(writer, doc.pk)


def remove_document_by_id(writer, doc_id):
    writer.delete_by_term('id', doc_id)


def add_or_update_document(document):
    ix = open_index()
    with AsyncWriter(ix) as writer:
        update_document(writer, document)


def remove_document_from_index(document):
    ix = open_index()
    with AsyncWriter(ix) as writer:
        remove_document(writer, document)
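
# These two functions are the module's write API. A minimal sketch of how
# they might be wired to a Django signal (handler name and signal are
# assumptions, not part of this module):
#
#     def add_to_index(sender, document, **kwargs):
#         add_or_update_document(document)
#
#     document_consumption_finished.connect(add_to_index)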


@contextmanager
def query_page(ix, page, querystring, more_like_doc_id, more_like_doc_content):
    searcher = ix.searcher()
    try:
        if querystring:
            qp = MultifieldParser(
                ["content", "title", "correspondent", "tag", "type"],
                ix.schema)
            # Understands human-readable dates in queries, e.g. created:2020.
            qp.add_plugin(DateParserPlugin())
            str_q = qp.parse(querystring)
            corrected = searcher.correct_query(str_q, querystring)
        else:
            str_q = None
            corrected = None

        if more_like_doc_id:
            docnum = searcher.document_number(id=more_like_doc_id)
            # "More like this": extract the 20 most characteristic terms of
            # the reference document and search for those, masking the
            # reference document itself from the results.
            kts = searcher.key_terms_from_text(
                'content', more_like_doc_content, numterms=20,
                model=classify.Bo1Model, normalize=False)
            more_like_q = query.Or(
                [query.Term('content', word, boost=weight)
                 for word, weight in kts])
            result_page = searcher.search_page(
                more_like_q, page, filter=str_q, mask={docnum})
        elif str_q:
            result_page = searcher.search_page(str_q, page)
        else:
            raise ValueError(
                "Either querystring or more_like_doc_id is required."
            )

        result_page.results.fragmenter = highlight.ContextFragmenter(
            surround=50)
        result_page.results.formatter = JsonFormatter()

        if corrected and corrected.query != str_q:
            corrected_query = corrected.string
        else:
            corrected_query = None

        yield result_page, corrected_query
    finally:
        searcher.close()
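
# Usage sketch (hypothetical values). query_page() is a context manager so
# that the searcher is closed even if rendering the page fails:
#
#     ix = open_index()
#     with query_page(ix, 1, "invoice created:2020", None, None) as (
#             result_page, corrected_query):
#         for hit in result_page:
#             print(hit['id'], hit['title'])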


def autocomplete(ix, term, limit=10):
    with ix.reader() as reader:
        terms = []
        for (score, t) in reader.most_distinctive_terms(
                "content", number=limit, prefix=term.lower()):
            terms.append(t)
        return terms
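
# Usage sketch (hypothetical input): suggest up to ten indexed content terms
# that start with what the user has typed so far.
#
#     ix = open_index()
#     suggestions = autocomplete(ix, "inv")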