paperless-ngx/src/documents/management/commands/document_fuzzy_match.py

129 lines
4.1 KiB
Python
Raw Normal View History

import dataclasses
import multiprocessing
from typing import Final
import rapidfuzz
import tqdm
from django.core.management import BaseCommand
from django.core.management import CommandError
from documents.models import Document
@dataclasses.dataclass(frozen=True)
class _WorkPackage:
first_doc: Document
second_doc: Document
@dataclasses.dataclass(frozen=True)
class _WorkResult:
doc_one_pk: int
doc_two_pk: int
ratio: float
def __lt__(self, other: "_WorkResult") -> bool:
return self.doc_one_pk < other.doc_one_pk
def _process_and_match(work: _WorkPackage) -> _WorkResult:
# Normalize the string some, lower case, whitespace, etc
first_string = rapidfuzz.utils.default_process(work.first_doc.content)
second_string = rapidfuzz.utils.default_process(work.second_doc.content)
# Basic matching ratio
match = rapidfuzz.fuzz.ratio(first_string, second_string)
return _WorkResult(work.first_doc.pk, work.second_doc.pk, match)
class Command(BaseCommand):
help = "Searches for documents where the content almost matches"
def add_arguments(self, parser):
parser.add_argument(
"--ratio",
default=85.0,
type=float,
help="Ratio to consider documents a match",
)
parser.add_argument(
"--processes",
default=4,
type=int,
help="Number of processes to distribute work amongst",
)
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown",
)
def handle(self, *args, **options):
RATIO_MIN: Final[float] = 0.0
RATIO_MAX: Final[float] = 100.0
opt_ratio = options["ratio"]
checked_pairs: set[tuple[int, int]] = set()
work_pkgs: list[_WorkPackage] = []
# Ratio is a float from 0.0 to 100.0
if opt_ratio < RATIO_MIN or opt_ratio > RATIO_MAX:
raise CommandError("The ratio must be between 0 and 100")
all_docs = Document.objects.all().order_by("id")
# Build work packages for processing
for first_doc in all_docs:
for second_doc in all_docs:
# doc to doc is obviously not useful
if first_doc.pk == second_doc.pk:
continue
# Skip matching which have already been matched together
# doc 1 to doc 2 is the same as doc 2 to doc 1
if (first_doc.pk, second_doc.pk) in checked_pairs or (
second_doc.pk,
first_doc.pk,
) in checked_pairs:
continue
checked_pairs.update(
[(first_doc.pk, second_doc.pk), (second_doc.pk, first_doc.pk)],
)
work_pkgs.append(_WorkPackage(first_doc, second_doc))
# Don't spin up a pool of 1 process
if options["processes"] == 1:
results = []
for work in tqdm.tqdm(work_pkgs, disable=options["no_progress_bar"]):
results.append(_process_and_match(work))
else:
with multiprocessing.Pool(processes=options["processes"]) as pool:
results = list(
tqdm.tqdm(
pool.imap_unordered(_process_and_match, work_pkgs),
total=len(work_pkgs),
disable=options["no_progress_bar"],
),
)
# Check results
messages = []
for result in sorted(results):
if result.ratio >= opt_ratio:
messages.append(
self.style.NOTICE(
f"Document {result.doc_one_pk} fuzzy match"
f" to {result.doc_two_pk} (confidence {result.ratio:.3f})",
),
)
if len(messages) == 0:
messages.append(
self.style.SUCCESS("No matches found"),
)
self.stdout.writelines(
messages,
)