paperless-ngx/src/documents/matching.py

import logging
import re

from documents.models import Correspondent
from documents.models import DocumentType
from documents.models import MatchingModel
from documents.models import Tag


logger = logging.getLogger("paperless.matching")


def log_reason(matching_model, document, reason):
    """
    Emit a debug log line explaining why ``matching_model`` matched
    ``document``.

    The line has the form
    ``<class name> <model name> matched on document <document> because
    <reason>``, where the class name is taken from the type of
    ``matching_model`` and the model name from its ``name`` attribute.
    """
    # Hand the values to the logger as arguments instead of pre-formatting
    # the message: the text is then only built (and ``document`` only
    # converted to a string) when a DEBUG record is actually emitted.
    logger.debug(
        "%s %s matched on document %s because %s",
        type(matching_model).__name__,
        matching_model.name,
        document,
        reason,
    )


def match_correspondents(document, classifier):
    """
    Return the list of correspondents that apply to ``document``.

    A correspondent is kept when its own match rule accepts the document
    (see ``matches``) or when its primary key equals the correspondent
    predicted by ``classifier``.  When ``classifier`` is falsy no prediction
    is made and only the match rules decide.
    """
    predicted_pk = (
        classifier.predict_correspondent(document.content) if classifier else None
    )

    return [
        correspondent
        for correspondent in Correspondent.objects.all()
        if matches(correspondent, document) or correspondent.pk == predicted_pk
    ]


def match_document_types(document, classifier):
    """
    Return the list of document types that apply to ``document``.

    A document type is kept when its own match rule accepts the document
    (see ``matches``) or when its primary key equals the document type
    predicted by ``classifier``.  When ``classifier`` is falsy no prediction
    is made and only the match rules decide.
    """
    predicted_pk = (
        classifier.predict_document_type(document.content) if classifier else None
    )

    return [
        document_type
        for document_type in DocumentType.objects.all()
        if matches(document_type, document) or document_type.pk == predicted_pk
    ]


def match_tags(document, classifier):
    """
    Return the list of tags that apply to ``document``.

    A tag is kept when its own match rule accepts the document (see
    ``matches``) or when its primary key is among the tag ids predicted by
    ``classifier``.  When ``classifier`` is falsy the set of predictions is
    empty and only the match rules decide.
    """
    predicted_pks = classifier.predict_tags(document.content) if classifier else []

    return [
        tag
        for tag in Tag.objects.all()
        if matches(tag, document) or tag.pk in predicted_pks
    ]


def matches(matching_model, document):
    """
    Decide whether the match rule stored on ``matching_model`` accepts the
    content of ``document``.

    A rule whose ``match`` text is blank never matches.  Otherwise
    ``matching_model.matching_algorithm`` selects how ``match`` is compared
    with ``document.content``: all words, any word, literal string, regular
    expression or fuzzy comparison.  ``is_insensitive`` switches the
    comparison to ignore case.  Rules using the automatic algorithm always
    yield False here.  Every successful match is reported via ``log_reason``.

    Returns True or False.  Raises NotImplementedError if the match text is
    not blank and the algorithm is none of the values handled below.
    """
    content = document.content

    # A blank match text can never match anything.
    if not matching_model.match.strip():
        return False

    flag_kwargs = {}
    if matching_model.is_insensitive:
        flag_kwargs["flags"] = re.IGNORECASE

    algorithm = matching_model.matching_algorithm

    if algorithm == MatchingModel.MATCH_ALL:
        # Every keyword has to occur as a whole word; stop at the first miss.
        every_term_found = all(
            re.search(rf"\b{term}\b", content, **flag_kwargs)
            for term in _split_match(matching_model)
        )
        if not every_term_found:
            return False
        log_reason(
            matching_model,
            document,
            f"it contains all of these words: {matching_model.match}",
        )
        return True

    if algorithm == MatchingModel.MATCH_ANY:
        # The first keyword found as a whole word settles the question.
        for term in _split_match(matching_model):
            if not re.search(rf"\b{term}\b", content, **flag_kwargs):
                continue
            log_reason(matching_model, document, f"it contains this word: {term}")
            return True
        return False

    if algorithm == MatchingModel.MATCH_LITERAL:
        # The whole match text, taken verbatim, between word boundaries.
        literal_pattern = rf"\b{re.escape(matching_model.match)}\b"
        if re.search(literal_pattern, content, **flag_kwargs) is None:
            return False
        log_reason(
            matching_model,
            document,
            f'it contains this string: "{matching_model.match}"',
        )
        return True

    if algorithm == MatchingModel.MATCH_REGEX:
        try:
            compiled = re.compile(matching_model.match, **flag_kwargs)
            hit = compiled.search(content)
        except re.error:
            # An invalid expression is logged and treated as "no match".
            logger.error(
                f"Error while processing regular expression {matching_model.match}",
            )
            return False
        if hit is None:
            return False
        log_reason(
            matching_model,
            document,
            f"the string {hit.group()} matches the regular expression "
            f"{matching_model.match}",
        )
        return True

    if algorithm == MatchingModel.MATCH_FUZZY:
        # Imported here so the dependency is only loaded when it is needed.
        from fuzzywuzzy import fuzz

        # Compare with everything but word characters and whitespace removed.
        non_word = r"[^\w\s]"
        needle = re.sub(non_word, "", matching_model.match)
        haystack = re.sub(non_word, "", content)
        if matching_model.is_insensitive:
            needle, haystack = needle.lower(), haystack.lower()
        if fuzz.partial_ratio(needle, haystack) < 90:
            return False
        # TODO: make this better
        log_reason(
            matching_model,
            document,
            f"parts of the document content somehow match the string "
            f"{matching_model.match}",
        )
        return True

    if algorithm == MatchingModel.MATCH_AUTO:
        # this is done elsewhere.
        return False

    raise NotImplementedError("Unsupported matching algorithm")


def _split_match(matching_model):
    r"""
    Break ``matching_model.match`` into a list of regex-ready keywords.

    Terms are separated by whitespace; a double-quoted section counts as a
    single term.  Each term is stripped, runs of whitespace inside it are
    collapsed, the result is escaped for use in a regular expression and
    every remaining space is turned into ``\s+``.

    Example:
      '  some random  words "with   quotes  " and   spaces'
        ==>
      ["some", "random", "words", "with\s+quotes", "and", "spaces"]
    """
    keywords = []
    for quoted, bare in re.findall(r'"([^"]+)"|(\S+)', matching_model.match):
        # Exactly one of the two groups is filled for each term found.
        term = (quoted or bare).strip()
        single_spaced = re.sub(r"\s+", " ", term)
        keywords.append(re.escape(single_spaced).replace(r"\ ", r"\s+"))
    return keywords
fixes #161 2021-01-13 17:17:23 +01:00			`import logging`
unified document matching, legacy and automatching work alongside now 2020-10-28 11:45:11 +01:00			`import re`

Runs the pre-commit hooks over all the Python files 2022-03-11 10:55:51 -08:00			`from documents.models import Correspondent`
			`from documents.models import DocumentType`
			`from documents.models import MatchingModel`
			`from documents.models import Tag`
unified document matching, legacy and automatching work alongside now 2020-10-28 11:45:11 +01:00

rework most of the logging 2021-02-05 01:10:29 +01:00			`logger = logging.getLogger("paperless.matching")`
fixes #161 2021-01-13 17:17:23 +01:00

			`def log_reason(matching_model, document, reason):`
			`class_name = type(matching_model).__name__`
			`logger.debug(`
fix some logging messages 2021-02-06 15:30:47 +01:00			`f"{class_name} {matching_model.name} matched on document "`
Runs the pre-commit hooks over all the Python files 2022-03-11 10:55:51 -08:00			`f"{document} because {reason}",`
Format Python code with black 2022-02-27 15:26:41 +01:00			`)`
fixes #161 2021-01-13 17:17:23 +01:00

			`def match_correspondents(document, classifier):`
code cleanup 2020-11-21 14:03:45 +01:00			`if classifier:`
fixes #161 2021-01-13 17:17:23 +01:00			`pred_id = classifier.predict_correspondent(document.content)`
code cleanup 2020-11-21 14:03:45 +01:00			`else:`
			`pred_id = None`
unified document matching, legacy and automatching work alongside now 2020-10-28 11:45:11 +01:00
code cleanup 2020-11-21 14:03:45 +01:00			`correspondents = Correspondent.objects.all()`
code cleanup 2020-11-21 15:34:00 +01:00
Format Python code with black 2022-02-27 15:26:41 +01:00			`return list(`
Runs the pre-commit hooks over all the Python files 2022-03-11 10:55:51 -08:00			`filter(lambda o: matches(o, document) or o.pk == pred_id, correspondents),`
Format Python code with black 2022-02-27 15:26:41 +01:00			`)`
unified document matching, legacy and automatching work alongside now 2020-10-28 11:45:11 +01:00

fixes #161 2021-01-13 17:17:23 +01:00			`def match_document_types(document, classifier):`
code cleanup 2020-11-21 14:03:45 +01:00			`if classifier:`
fixes #161 2021-01-13 17:17:23 +01:00			`pred_id = classifier.predict_document_type(document.content)`
code cleanup 2020-11-21 14:03:45 +01:00			`else:`
			`pred_id = None`
unified document matching, legacy and automatching work alongside now 2020-10-28 11:45:11 +01:00
code cleanup 2020-11-21 14:03:45 +01:00			`document_types = DocumentType.objects.all()`
code cleanup 2020-11-21 15:34:00 +01:00
Format Python code with black 2022-02-27 15:26:41 +01:00			`return list(`
Runs the pre-commit hooks over all the Python files 2022-03-11 10:55:51 -08:00			`filter(lambda o: matches(o, document) or o.pk == pred_id, document_types),`
Format Python code with black 2022-02-27 15:26:41 +01:00			`)`
unified document matching, legacy and automatching work alongside now 2020-10-28 11:45:11 +01:00

fixes #161 2021-01-13 17:17:23 +01:00			`def match_tags(document, classifier):`
code cleanup 2020-11-21 15:34:00 +01:00			`if classifier:`
fixes #161 2021-01-13 17:17:23 +01:00			`predicted_tag_ids = classifier.predict_tags(document.content)`
code cleanup 2020-11-21 15:34:00 +01:00			`else:`
			`predicted_tag_ids = []`

			`tags = Tag.objects.all()`
unified document matching, legacy and automatching work alongside now 2020-10-28 11:45:11 +01:00
Format Python code with black 2022-02-27 15:26:41 +01:00			`return list(`
Runs the pre-commit hooks over all the Python files 2022-03-11 10:55:51 -08:00			`filter(lambda o: matches(o, document) or o.pk in predicted_tag_ids, tags),`
Format Python code with black 2022-02-27 15:26:41 +01:00			`)`
unified document matching, legacy and automatching work alongside now 2020-10-28 11:45:11 +01:00

fixes #161 2021-01-13 17:17:23 +01:00			`def matches(matching_model, document):`
unified document matching, legacy and automatching work alongside now 2020-10-28 11:45:11 +01:00			`search_kwargs = {}`

removing transformation to lowercase since it is not needed at all. 2022-04-01 18:26:01 +02:00			`document_content = document.content`

unified document matching, legacy and automatching work alongside now 2020-10-28 11:45:11 +01:00			`# Check that match is not empty`
			`if matching_model.match.strip() == "":`
			`return False`

			`if matching_model.is_insensitive:`
			`search_kwargs = {"flags": re.IGNORECASE}`

			`if matching_model.matching_algorithm == MatchingModel.MATCH_ALL:`
			`for word in _split_match(matching_model):`
Format Python code with black 2022-02-27 15:26:41 +01:00			`search_result = re.search(rf"\b{word}\b", document_content, **search_kwargs)`
unified document matching, legacy and automatching work alongside now 2020-10-28 11:45:11 +01:00			`if not search_result:`
			`return False`
fixes #161 2021-01-13 17:17:23 +01:00			`log_reason(`
Format Python code with black 2022-02-27 15:26:41 +01:00			`matching_model,`
			`document,`
			`f"it contains all of these words: {matching_model.match}",`
fixes #161 2021-01-13 17:17:23 +01:00			`)`
unified document matching, legacy and automatching work alongside now 2020-10-28 11:45:11 +01:00			`return True`

code cleanup 2020-11-21 15:34:00 +01:00			`elif matching_model.matching_algorithm == MatchingModel.MATCH_ANY:`
unified document matching, legacy and automatching work alongside now 2020-10-28 11:45:11 +01:00			`for word in _split_match(matching_model):`
code cleanup 2020-11-21 15:34:00 +01:00			`if re.search(rf"\b{word}\b", document_content, **search_kwargs):`
Format Python code with black 2022-02-27 15:26:41 +01:00			`log_reason(matching_model, document, f"it contains this word: {word}")`
unified document matching, legacy and automatching work alongside now 2020-10-28 11:45:11 +01:00			`return True`
			`return False`

code cleanup 2020-11-21 15:34:00 +01:00			`elif matching_model.matching_algorithm == MatchingModel.MATCH_LITERAL:`
Format Python code with black 2022-02-27 15:26:41 +01:00			`result = bool(`
			`re.search(`
			`rf"\b{re.escape(matching_model.match)}\b",`
			`document_content,`
			`**search_kwargs,`
Runs the pre-commit hooks over all the Python files 2022-03-11 10:55:51 -08:00			`),`
Format Python code with black 2022-02-27 15:26:41 +01:00			`)`
fixes #161 2021-01-13 17:17:23 +01:00			`if result:`
			`log_reason(`
Format Python code with black 2022-02-27 15:26:41 +01:00			`matching_model,`
			`document,`
			`f'it contains this string: "{matching_model.match}"',`
fixes #161 2021-01-13 17:17:23 +01:00			`)`
			`return result`
unified document matching, legacy and automatching work alongside now 2020-10-28 11:45:11 +01:00
code cleanup 2020-11-21 15:34:00 +01:00			`elif matching_model.matching_algorithm == MatchingModel.MATCH_REGEX:`
fixes #161 2021-01-13 17:17:23 +01:00			`try:`
			`match = re.search(`
Runs the pre-commit hooks over all the Python files 2022-03-11 10:55:51 -08:00			`re.compile(matching_model.match, **search_kwargs),`
			`document_content,`
fixes #161 2021-01-13 17:17:23 +01:00			`)`
			`except re.error:`
			`logger.error(`
Runs the pre-commit hooks over all the Python files 2022-03-11 10:55:51 -08:00			`f"Error while processing regular expression " f"{matching_model.match}",`
fixes #161 2021-01-13 17:17:23 +01:00			`)`
			`return False`
			`if match:`
			`log_reason(`
Format Python code with black 2022-02-27 15:26:41 +01:00			`matching_model,`
			`document,`
fixes #161 2021-01-13 17:17:23 +01:00			`f"the string {match.group()} matches the regular expression "`
Format Python code with black 2022-02-27 15:26:41 +01:00			`f"{matching_model.match}",`
fixes #161 2021-01-13 17:17:23 +01:00			`)`
			`return bool(match)`
unified document matching, legacy and automatching work alongside now 2020-10-28 11:45:11 +01:00
code cleanup 2020-11-21 15:34:00 +01:00			`elif matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY:`
lazy load fuzzy only when required 2021-02-05 01:11:03 +01:00			`from fuzzywuzzy import fuzz`

Format Python code with black 2022-02-27 15:26:41 +01:00			`match = re.sub(r"[^\w\s]", "", matching_model.match)`
			`text = re.sub(r"[^\w\s]", "", document_content)`
unified document matching, legacy and automatching work alongside now 2020-10-28 11:45:11 +01:00			`if matching_model.is_insensitive:`
			`match = match.lower()`
			`text = text.lower()`
fixes #161 2021-01-13 17:17:23 +01:00			`if fuzz.partial_ratio(match, text) >= 90:`
			`# TODO: make this better`
			`log_reason(`
Format Python code with black 2022-02-27 15:26:41 +01:00			`matching_model,`
			`document,`
fixes #161 2021-01-13 17:17:23 +01:00			`f"parts of the document content somehow match the string "`
Format Python code with black 2022-02-27 15:26:41 +01:00			`f"{matching_model.match}",`
fixes #161 2021-01-13 17:17:23 +01:00			`)`
			`return True`
			`else:`
			`return False`
unified document matching, legacy and automatching work alongside now 2020-10-28 11:45:11 +01:00
code cleanup 2020-11-21 15:34:00 +01:00			`elif matching_model.matching_algorithm == MatchingModel.MATCH_AUTO:`
unified document matching, legacy and automatching work alongside now 2020-10-28 11:45:11 +01:00			`# this is done elsewhere.`
			`return False`

code cleanup 2020-11-21 15:34:00 +01:00			`else:`
			`raise NotImplementedError("Unsupported matching algorithm")`
unified document matching, legacy and automatching work alongside now 2020-10-28 11:45:11 +01:00

			`def _split_match(matching_model):`
			`"""`
			`Splits the match to individual keywords, getting rid of unnecessary`
			`spaces and grouping quoted words together.`

			`Example:`
			`' some random words "with quotes " and spaces'`
			`==>`
			`["some", "random", "words", "with+quotes", "and", "spaces"]`
			`"""`
			`findterms = re.compile(r'"([^"]+)"\|(\S+)').findall`
			`normspace = re.compile(r"\s+").sub`
			`return [`
fixes #668 (see https://github.com/the-paperless-project/paperless/pull/571) 2021-03-17 22:44:18 +01:00			`# normspace(" ", (t[0] or t[1]).strip()).replace(" ", r"\s+")`
Format Python code with black 2022-02-27 15:26:41 +01:00			`re.escape(normspace(" ", (t[0] or t[1]).strip())).replace(r"\ ", r"\s+")`
unified document matching, legacy and automatching work alongside now 2020-10-28 11:45:11 +01:00			`for t in findterms(matching_model.match)`
			`]`