paperless-ngx/src/documents/matching.py

232 lines
7 KiB
Python
Raw Normal View History

2021-01-13 17:17:23 +01:00
import logging
import re
from documents.models import Correspondent
from documents.models import DocumentType
from documents.models import MatchingModel
Feature: Dynamic document storage pathes (#916) * Added devcontainer * Add feature storage pathes * Exclude tests and add versioning * Check escaping * Check escaping * Check quoting * Echo * Escape * Escape : * Double escape \ * Escaping * Remove if * Escape colon * Missing \ * Esacpe : * Escape all * test * Remove sed * Fix exclude * Remove SED command * Add LD_LIBRARY_PATH * Adjusted to v1.7 * Updated test-cases * Remove devcontainer * Removed internal build-file * Run pre-commit * Corrected flak8 error * Adjusted to v1.7 * Updated test-cases * Corrected flak8 error * Adjusted to new plural translations * Small adjustments due to code-review backend * Adjusted line-break * Removed PAPERLESS prefix from settings variables * Corrected style change due to search+replace * First documentation draft * Revert changes to Pipfile * Add sphinx-autobuild with keep-outdated * Revert merge error that results in wrong storage path is evaluated * Adjust styles of generated files ... * Adds additional testing to cover dynamic storage path functionality * Remove unnecessary condition * Add hint to edit storage path dialog * Correct spelling of pathes to paths * Minor documentation tweaks * Minor typo * improving wrapping of filter editor buttons with new storage path button * Update .gitignore * Fix select border radius in non input-groups * Better storage path edit hint * Add note to edit storage path dialog re document_renamer * Add note to bulk edit storage path re document_renamer * Rename FILTER_STORAGE_DIRECTORY to PATH * Fix broken filter rule parsing * Show default storage if unspecified * Remove note re storage path on bulk edit * Add basic validation of filename variables Co-authored-by: Markus Kling <markus@markus-kling.net> Co-authored-by: Trenton Holmes <holmes.trenton@gmail.com> Co-authored-by: Michael Shamoon <4887959+shamoon@users.noreply.github.com> Co-authored-by: Quinn Casey <quinn@quinncasey.com>
2022-05-19 23:42:25 +02:00
from documents.models import StoragePath
from documents.models import Tag
2023-04-15 22:47:36 -07:00
from documents.permissions import get_objects_for_user_owner_aware
2021-02-05 01:10:29 +01:00
logger = logging.getLogger("paperless.matching")
2021-01-13 17:17:23 +01:00
def log_reason(matching_model, document, reason):
    """
    Emit a debug log line explaining why a matching model matched a document.

    The message contains the class name of ``matching_model``, its ``name``
    attribute, the string form of ``document`` and the supplied ``reason``.
    """
    model_kind = type(matching_model).__name__
    message = (
        f"{model_kind} {matching_model.name} matched on document "
        f"{document} because {reason}"
    )
    logger.debug(message)
2021-01-13 17:17:23 +01:00
2023-04-15 22:47:36 -07:00
def match_correspondents(document, classifier, user=None):
    """
    Return the list of correspondents that match ``document``.

    A correspondent is included when ``matches()`` accepts it, or when it uses
    the MATCH_AUTO algorithm and its primary key equals the classifier's
    prediction for the document content. Without a classifier there is no
    prediction.

    When ``user`` is not given, the document's owner (if any) is used instead.
    With a user, candidates come from ``get_objects_for_user_owner_aware`` for
    the ``documents.view_correspondent`` permission; with no user at all,
    every correspondent is considered.
    """
    predicted_pk = None
    if classifier:
        predicted_pk = classifier.predict_correspondent(document.content)

    # Fall back to the document owner only when no explicit user was passed.
    viewer = document.owner if user is None else user

    if viewer is None:
        candidates = Correspondent.objects.all()
    else:
        candidates = get_objects_for_user_owner_aware(
            viewer,
            "documents.view_correspondent",
            Correspondent,
        )

    return [
        candidate
        for candidate in candidates
        if matches(candidate, document)
        or (
            candidate.pk == predicted_pk
            and candidate.matching_algorithm == MatchingModel.MATCH_AUTO
        )
    ]
2023-04-15 22:47:36 -07:00
def match_document_types(document, classifier, user=None):
    """
    Return the list of document types that match ``document``.

    A document type is included when ``matches()`` accepts it, or when it uses
    the MATCH_AUTO algorithm and its primary key equals the classifier's
    prediction for the document content. Without a classifier there is no
    prediction.

    When ``user`` is not given, the document's owner (if any) is used instead.
    With a user, candidates come from ``get_objects_for_user_owner_aware`` for
    the ``documents.view_documenttype`` permission; with no user at all,
    every document type is considered.
    """
    predicted_pk = None
    if classifier:
        predicted_pk = classifier.predict_document_type(document.content)

    # Fall back to the document owner only when no explicit user was passed.
    viewer = document.owner if user is None else user

    if viewer is None:
        candidates = DocumentType.objects.all()
    else:
        candidates = get_objects_for_user_owner_aware(
            viewer,
            "documents.view_documenttype",
            DocumentType,
        )

    return [
        candidate
        for candidate in candidates
        if matches(candidate, document)
        or (
            candidate.pk == predicted_pk
            and candidate.matching_algorithm == MatchingModel.MATCH_AUTO
        )
    ]
2023-04-15 22:47:36 -07:00
def match_tags(document, classifier, user=None):
    """
    Return the list of tags that match ``document``.

    A tag is included when ``matches()`` accepts it, or when it uses the
    MATCH_AUTO algorithm and its primary key is among the tag ids predicted
    by the classifier for the document content. Without a classifier nothing
    is predicted.

    When ``user`` is not given, the document's owner (if any) is used instead.
    With a user, candidates come from ``get_objects_for_user_owner_aware`` for
    the ``documents.view_tag`` permission; with no user at all, every tag is
    considered.
    """
    predicted_pks = []
    if classifier:
        predicted_pks = classifier.predict_tags(document.content)

    # Fall back to the document owner only when no explicit user was passed.
    viewer = document.owner if user is None else user

    if viewer is None:
        candidates = Tag.objects.all()
    else:
        candidates = get_objects_for_user_owner_aware(
            viewer,
            "documents.view_tag",
            Tag,
        )

    return [
        candidate
        for candidate in candidates
        if matches(candidate, document)
        or (
            candidate.matching_algorithm == MatchingModel.MATCH_AUTO
            and candidate.pk in predicted_pks
        )
    ]
2023-04-15 22:47:36 -07:00
def match_storage_paths(document, classifier, user=None):
    """
    Return the list of storage paths that match ``document``.

    A storage path is included when ``matches()`` accepts it, or when it uses
    the MATCH_AUTO algorithm and its primary key equals the classifier's
    prediction for the document content. Without a classifier there is no
    prediction.

    When ``user`` is not given, the document's owner (if any) is used instead.
    With a user, candidates come from ``get_objects_for_user_owner_aware`` for
    the ``documents.view_storagepath`` permission; with no user at all,
    every storage path is considered.
    """
    predicted_pk = None
    if classifier:
        predicted_pk = classifier.predict_storage_path(document.content)

    # Fall back to the document owner only when no explicit user was passed.
    viewer = document.owner if user is None else user

    if viewer is None:
        candidates = StoragePath.objects.all()
    else:
        candidates = get_objects_for_user_owner_aware(
            viewer,
            "documents.view_storagepath",
            StoragePath,
        )

    return [
        candidate
        for candidate in candidates
        if matches(candidate, document)
        or (
            candidate.pk == predicted_pk
            and candidate.matching_algorithm == MatchingModel.MATCH_AUTO
        )
    ]
2021-01-13 17:17:23 +01:00
def matches(matching_model, document):
    """
    Decide whether ``matching_model`` matches the content of ``document``.

    The decision depends on ``matching_model.matching_algorithm``:

    * MATCH_NONE / MATCH_AUTO: always ``False`` here (MATCH_AUTO is resolved
      by the ``match_*`` functions using classifier predictions).
    * MATCH_ALL: every term from ``_split_match`` must occur as a whole word.
    * MATCH_ANY: at least one term from ``_split_match`` must occur as a
      whole word.
    * MATCH_LITERAL: the match string must occur verbatim between word
      boundaries.
    * MATCH_REGEX: the match string is used as a regular expression; an
      invalid expression is logged and treated as "no match".
    * MATCH_FUZZY: punctuation is stripped from both sides and
      ``rapidfuzz.fuzz.partial_ratio`` is called with ``score_cutoff=90``;
      a truthy score counts as a match.

    A blank match string never matches, whatever the algorithm. When
    ``matching_model.is_insensitive`` is set, comparisons ignore case.
    Successful matches are reported through ``log_reason``.

    Returns a bool. Raises ``NotImplementedError`` for an unknown algorithm
    (only reached when the match string is not blank).
    """
    text = document.content

    # A blank pattern can never match anything.
    if not matching_model.match.strip():
        return False

    regex_options = {"flags": re.IGNORECASE} if matching_model.is_insensitive else {}
    algorithm = matching_model.matching_algorithm

    if algorithm == MatchingModel.MATCH_NONE:
        return False

    if algorithm == MatchingModel.MATCH_ALL:
        every_term_found = all(
            re.search(rf"\b{term}\b", text, **regex_options)
            for term in _split_match(matching_model)
        )
        if not every_term_found:
            return False
        log_reason(
            matching_model,
            document,
            f"it contains all of these words: {matching_model.match}",
        )
        return True

    if algorithm == MatchingModel.MATCH_ANY:
        # First term that occurs in the text, or None when none does.
        first_hit = next(
            (
                term
                for term in _split_match(matching_model)
                if re.search(rf"\b{term}\b", text, **regex_options)
            ),
            None,
        )
        if first_hit is None:
            return False
        log_reason(matching_model, document, f"it contains this word: {first_hit}")
        return True

    if algorithm == MatchingModel.MATCH_LITERAL:
        literal_pattern = rf"\b{re.escape(matching_model.match)}\b"
        if re.search(literal_pattern, text, **regex_options) is None:
            return False
        log_reason(
            matching_model,
            document,
            f'it contains this string: "{matching_model.match}"',
        )
        return True

    if algorithm == MatchingModel.MATCH_REGEX:
        try:
            found = re.compile(matching_model.match, **regex_options).search(text)
        except re.error:
            logger.error(
                f"Error while processing regular expression {matching_model.match}",
            )
            return False
        if found is None:
            return False
        log_reason(
            matching_model,
            document,
            f"the string {found.group()} matches the regular expression "
            f"{matching_model.match}",
        )
        return True

    if algorithm == MatchingModel.MATCH_FUZZY:
        # Imported lazily, only when a fuzzy rule is actually evaluated.
        from rapidfuzz import fuzz

        # Drop everything that is neither a word character nor whitespace.
        needle = re.sub(r"[^\w\s]", "", matching_model.match)
        haystack = re.sub(r"[^\w\s]", "", text)
        if matching_model.is_insensitive:
            needle = needle.lower()
            haystack = haystack.lower()

        if not fuzz.partial_ratio(needle, haystack, score_cutoff=90):
            return False
        # TODO: make this better
        log_reason(
            matching_model,
            document,
            f"parts of the document content somehow match the string "
            f"{matching_model.match}",
        )
        return True

    if algorithm == MatchingModel.MATCH_AUTO:
        # Automatic matching is decided by the callers via the classifier.
        return False

    raise NotImplementedError("Unsupported matching algorithm")
def _split_match(matching_model):
"""
Splits the match to individual keywords, getting rid of unnecessary
spaces and grouping quoted words together.
Example:
' some random words "with quotes " and spaces'
==>
["some", "random", "words", "with+quotes", "and", "spaces"]
"""
findterms = re.compile(r'"([^"]+)"|(\S+)').findall
normspace = re.compile(r"\s+").sub
return [
# normspace(" ", (t[0] or t[1]).strip()).replace(" ", r"\s+")
2022-02-27 15:26:41 +01:00
re.escape(normspace(" ", (t[0] or t[1]).strip())).replace(r"\ ", r"\s+")
for t in findterms(matching_model.match)
]