2023-09-11 08:48:30 -07:00
|
|
|
import dataclasses
|
|
|
|
|
import multiprocessing
|
2023-09-10 16:32:10 -07:00
|
|
|
from typing import Final
|
|
|
|
|
|
|
|
|
|
import rapidfuzz
|
2023-09-10 21:34:40 -07:00
|
|
|
import tqdm
|
2023-09-10 16:32:10 -07:00
|
|
|
from django.core.management import BaseCommand
|
|
|
|
|
from django.core.management import CommandError
|
|
|
|
|
|
|
|
|
|
from documents.models import Document
|
|
|
|
|
|
|
|
|
|
|
2023-09-11 08:48:30 -07:00
|
|
|
@dataclasses.dataclass(frozen=True)
class _WorkPackage:
    """Immutable unit of work: the two documents to compare with each other."""

    # One side of the comparison
    first_doc: Document
    # The other side of the comparison
    second_doc: Document
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclasses.dataclass(frozen=True)
|
|
|
|
|
class _WorkResult:
|
|
|
|
|
doc_one_pk: int
|
|
|
|
|
doc_two_pk: int
|
|
|
|
|
ratio: float
|
|
|
|
|
|
|
|
|
|
def __lt__(self, other: "_WorkResult") -> bool:
|
|
|
|
|
return self.doc_one_pk < other.doc_one_pk
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _process_and_match(work: _WorkPackage) -> _WorkResult:
    """Compare the content of the two documents in ``work``.

    Both contents are passed through rapidfuzz's default processor before
    the basic matching ratio is computed.

    Returns a ``_WorkResult`` holding both primary keys and the ratio.
    """
    doc_a, doc_b = work.first_doc, work.second_doc

    # Normalize both strings (lower case, whitespace, etc) prior to matching
    normalized_a, normalized_b = (
        rapidfuzz.utils.default_process(doc.content) for doc in (doc_a, doc_b)
    )

    # Basic matching ratio
    score = rapidfuzz.fuzz.ratio(normalized_a, normalized_b)

    return _WorkResult(doc_one_pk=doc_a.pk, doc_two_pk=doc_b.pk, ratio=score)
|
|
|
|
|
|
|
|
|
|
|
2023-09-10 16:32:10 -07:00
|
|
|
class Command(BaseCommand):
    """Management command reporting document pairs whose content almost matches."""

    help = "Searches for documents where the content almost matches"

    def add_arguments(self, parser):
        """Register the ``--ratio``, ``--processes`` and ``--no-progress-bar`` options."""
        parser.add_argument(
            "--ratio",
            default=85.0,
            type=float,
            help="Ratio to consider documents a match",
        )
        parser.add_argument(
            "--processes",
            default=4,
            type=int,
            help="Number of processes to distribute work amongst",
        )
        parser.add_argument(
            "--no-progress-bar",
            default=False,
            action="store_true",
            help="If set, the progress bar will not be shown",
        )

    def handle(self, *args, **options):
        """Compare every pair of documents and print those at or above the ratio.

        Reads ``ratio``, ``processes`` and ``no_progress_bar`` from
        ``options``.

        Raises:
            CommandError: if the ratio is outside 0-100 or fewer than one
                process is requested.
        """
        # Local import: keeps this block self-contained without touching the
        # module's import section
        import itertools

        RATIO_MIN: Final[float] = 0.0
        RATIO_MAX: Final[float] = 100.0

        opt_ratio = options["ratio"]
        opt_processes = options["processes"]
        hide_progress = options["no_progress_bar"]

        # Ratio is a float from 0.0 to 100.0
        if opt_ratio < RATIO_MIN or opt_ratio > RATIO_MAX:
            raise CommandError("The ratio must be between 0 and 100")

        # Fail with a readable message up front instead of letting
        # multiprocessing.Pool raise after all the work has been prepared
        if opt_processes < 1:
            raise CommandError("There must be at least 1 process")

        all_docs = Document.objects.all().order_by("id")

        # Build work packages for processing.  combinations() yields each
        # unordered pair exactly once and never pairs a document with itself
        # (doc 1 to doc 2 is the same as doc 2 to doc 1), so no bookkeeping
        # of already-checked pairs is needed.
        work_pkgs: list[_WorkPackage] = [
            _WorkPackage(first_doc, second_doc)
            for first_doc, second_doc in itertools.combinations(all_docs, 2)
        ]

        # Don't spin up a pool of 1 process
        if opt_processes == 1:
            results = [
                _process_and_match(work)
                for work in tqdm.tqdm(work_pkgs, disable=hide_progress)
            ]
        else:
            with multiprocessing.Pool(processes=opt_processes) as pool:
                # imap_unordered hands results back as workers finish, so
                # the progress bar advances per completed pair; the order is
                # restored by the sort below
                results = list(
                    tqdm.tqdm(
                        pool.imap_unordered(_process_and_match, work_pkgs),
                        total=len(work_pkgs),
                        disable=hide_progress,
                    ),
                )

        # Check results, in the order defined by _WorkResult
        messages = [
            self.style.NOTICE(
                f"Document {result.doc_one_pk} fuzzy match"
                f" to {result.doc_two_pk} (confidence {result.ratio:.3f})",
            )
            for result in sorted(results)
            if result.ratio >= opt_ratio
        ]

        if not messages:
            messages.append(self.style.SUCCESS("No matches found"))

        self.stdout.writelines(messages)
|