paperless-ngx/src/documents/management/commands/document_fuzzy_match.py

import dataclasses
import itertools
import multiprocessing
from typing import Final

import rapidfuzz
import tqdm
from django.core.management import BaseCommand
from django.core.management import CommandError

from documents.models import Document


@dataclasses.dataclass(frozen=True)
class _WorkPackage:
    """
    One unit of comparison work: a pair of documents to be scored against
    each other.
    """

    # First document of the pair; the matcher reads its content and pk
    first_doc: Document
    # Second document of the pair; the matcher reads its content and pk
    second_doc: Document


@dataclasses.dataclass(frozen=True)
class _WorkResult:
    doc_one_pk: int
    doc_two_pk: int
    ratio: float

    def __lt__(self, other: "_WorkResult") -> bool:
        return self.doc_one_pk < other.doc_one_pk


def _process_and_match(work: _WorkPackage) -> _WorkResult:
    """
    Score how closely the contents of the two documents in a work package
    match and return the score together with both primary keys.
    """
    left, right = work.first_doc, work.second_doc

    # Run both texts through rapidfuzz's default pre-processing (case,
    # whitespace and similar normalization) so the comparison is not thrown
    # off by purely cosmetic differences
    left_text = rapidfuzz.utils.default_process(left.content)
    right_text = rapidfuzz.utils.default_process(right.content)

    # Plain similarity ratio between the two normalized texts
    score = rapidfuzz.fuzz.ratio(left_text, right_text)

    return _WorkResult(doc_one_pk=left.pk, doc_two_pk=right.pk, ratio=score)


class Command(BaseCommand):
    """
    Management command which compares the content of every pair of documents
    and reports the pairs whose similarity ratio reaches a threshold.
    """

    help = "Searches for documents where the content almost matches"

    def add_arguments(self, parser):
        """
        Register the --ratio, --processes and --no-progress-bar options.
        """
        parser.add_argument(
            "--ratio",
            default=85.0,
            type=float,
            help="Ratio to consider documents a match",
        )
        parser.add_argument(
            "--processes",
            default=4,
            type=int,
            help="Number of processes to distribute work amongst",
        )
        parser.add_argument(
            "--no-progress-bar",
            default=False,
            action="store_true",
            help="If set, the progress bar will not be shown",
        )

    def handle(self, *args, **options):
        """
        Compare each unordered pair of documents once and write one line per
        pair whose ratio is at or above --ratio, or a single "No matches
        found" line when there are none.

        Raises:
            CommandError: if --ratio is outside 0-100 or --processes is
                less than 1.
        """
        RATIO_MIN: Final[float] = 0.0
        RATIO_MAX: Final[float] = 100.0

        opt_ratio = options["ratio"]
        opt_processes = options["processes"]
        hide_progress = options["no_progress_bar"]

        # Ratio is a float from 0.0 to 100.0
        if opt_ratio < RATIO_MIN or opt_ratio > RATIO_MAX:
            raise CommandError("The ratio must be between 0 and 100")

        # Reject a nonsensical worker count up front instead of letting the
        # pool constructor fail after all the work packages have been built
        if opt_processes < 1:
            raise CommandError("There must be at least 1 process")

        all_docs = Document.objects.all().order_by("id")

        # Build work packages for processing.  combinations() yields every
        # unordered pair exactly once (doc 1 to doc 2 is the same as doc 2
        # to doc 1) and never pairs a document with itself, so no
        # bookkeeping of already seen pairs is required
        work_pkgs: list[_WorkPackage] = [
            _WorkPackage(first_doc, second_doc)
            for first_doc, second_doc in itertools.combinations(all_docs, 2)
        ]

        # Don't spin up a pool of 1 process
        if opt_processes == 1:
            results = [
                _process_and_match(work)
                for work in tqdm.tqdm(work_pkgs, disable=hide_progress)
            ]
        else:
            with multiprocessing.Pool(processes=opt_processes) as pool:
                results = list(
                    tqdm.tqdm(
                        pool.imap_unordered(_process_and_match, work_pkgs),
                        total=len(work_pkgs),
                        disable=hide_progress,
                    ),
                )

        # Check results.  The pool returns them in completion order, so sort
        # on both primary keys to keep the report identical between runs
        messages = [
            self.style.NOTICE(
                f"Document {result.doc_one_pk} fuzzy match"
                f" to {result.doc_two_pk} (confidence {result.ratio:.3f})",
            )
            for result in sorted(
                results,
                key=lambda res: (res.doc_one_pk, res.doc_two_pk),
            )
            if result.ratio >= opt_ratio
        ]

        if not messages:
            messages.append(
                self.style.SUCCESS("No matches found"),
            )
        self.stdout.writelines(
            messages,
        )
Adds multiprocessing to the fuzzy matching for a speedup 2023-09-11 08:48:30 -07:00			`import dataclasses`
			`import multiprocessing`
Implements a new command for fuzzy matching document content and reporting potential duplicates 2023-09-10 16:32:10 -07:00			`from typing import Final`

			`import rapidfuzz`
Add progress bar to document_fuzzy_match 2023-09-10 21:34:40 -07:00			`import tqdm`
Implements a new command for fuzzy matching document content and reporting potential duplicates 2023-09-10 16:32:10 -07:00			`from django.core.management import BaseCommand`
			`from django.core.management import CommandError`

			`from documents.models import Document`


Adds multiprocessing to the fuzzy matching for a speedup 2023-09-11 08:48:30 -07:00			`@dataclasses.dataclass(frozen=True)`
			`class _WorkPackage:`
			`first_doc: Document`
			`second_doc: Document`


			`@dataclasses.dataclass(frozen=True)`
			`class _WorkResult:`
			`doc_one_pk: int`
			`doc_two_pk: int`
			`ratio: float`

			`def __lt__(self, other: "_WorkResult") -> bool:`
			`return self.doc_one_pk < other.doc_one_pk`


			`def _process_and_match(work: _WorkPackage) -> _WorkResult:`
			`# Normalize the string some, lower case, whitespace, etc`
			`first_string = rapidfuzz.utils.default_process(work.first_doc.content)`
			`second_string = rapidfuzz.utils.default_process(work.second_doc.content)`

			`# Basic matching ratio`
			`match = rapidfuzz.fuzz.ratio(first_string, second_string)`

			`return _WorkResult(work.first_doc.pk, work.second_doc.pk, match)`


Implements a new command for fuzzy matching document content and reporting potential duplicates 2023-09-10 16:32:10 -07:00			`class Command(BaseCommand):`
Adds multiprocessing to the fuzzy matching for a speedup 2023-09-11 08:48:30 -07:00			`help = "Searches for documents where the content almost matches"`
Implements a new command for fuzzy matching document content and reporting potential duplicates 2023-09-10 16:32:10 -07:00
			`def add_arguments(self, parser):`
			`parser.add_argument(`
			`"--ratio",`
			`default=85.0,`
			`type=float,`
			`help="Ratio to consider documents a match",`
			`)`
Adds multiprocessing to the fuzzy matching for a speedup 2023-09-11 08:48:30 -07:00			`parser.add_argument(`
			`"--processes",`
			`default=4,`
			`type=int,`
			`help="Number of processes to distribute work amongst",`
			`)`
Add progress bar to document_fuzzy_match 2023-09-10 21:34:40 -07:00			`parser.add_argument(`
			`"--no-progress-bar",`
			`default=False,`
			`action="store_true",`
			`help="If set, the progress bar will not be shown",`
			`)`
Implements a new command for fuzzy matching document content and reporting potential duplicates 2023-09-10 16:32:10 -07:00
			`def handle(self, *args, **options):`
			`RATIO_MIN: Final[float] = 0.0`
			`RATIO_MAX: Final[float] = 100.0`

			`opt_ratio = options["ratio"]`
Adds multiprocessing to the fuzzy matching for a speedup 2023-09-11 08:48:30 -07:00			`checked_pairs: set[tuple[int, int]] = set()`
			`work_pkgs: list[_WorkPackage] = []`
Implements a new command for fuzzy matching document content and reporting potential duplicates 2023-09-10 16:32:10 -07:00
			`# Ratio is a float from 0.0 to 100.0`
			`if opt_ratio < RATIO_MIN or opt_ratio > RATIO_MAX:`
			`raise CommandError("The ratio must be between 0 and 100")`

			`all_docs = Document.objects.all().order_by("id")`

Adds multiprocessing to the fuzzy matching for a speedup 2023-09-11 08:48:30 -07:00			`# Build work packages for processing`
			`for first_doc in all_docs:`
Implements a new command for fuzzy matching document content and reporting potential duplicates 2023-09-10 16:32:10 -07:00			`for second_doc in all_docs:`
Adds multiprocessing to the fuzzy matching for a speedup 2023-09-11 08:48:30 -07:00			`# doc to doc is obviously not useful`
Implements a new command for fuzzy matching document content and reporting potential duplicates 2023-09-10 16:32:10 -07:00			`if first_doc.pk == second_doc.pk:`
			`continue`
Adds multiprocessing to the fuzzy matching for a speedup 2023-09-11 08:48:30 -07:00			`# Skip matching which have already been matched together`
			`# doc 1 to doc 2 is the same as doc 2 to doc 1`
			`if (first_doc.pk, second_doc.pk) in checked_pairs or (`
			`second_doc.pk,`
			`first_doc.pk,`
			`) in checked_pairs:`
			`continue`
			`checked_pairs.update(`
			`[(first_doc.pk, second_doc.pk), (second_doc.pk, first_doc.pk)],`
			`)`

			`work_pkgs.append(_WorkPackage(first_doc, second_doc))`

			`# Don't spin up a pool of 1 process`
			`if options["processes"] == 1:`
			`results = []`
			`for work in tqdm.tqdm(work_pkgs, disable=options["no_progress_bar"]):`
			`results.append(_process_and_match(work))`
			`else:`
			`with multiprocessing.Pool(processes=options["processes"]) as pool:`
			`results = list(`
			`tqdm.tqdm(`
			`pool.imap_unordered(_process_and_match, work_pkgs),`
			`total=len(work_pkgs),`
			`disable=options["no_progress_bar"],`
			`),`
			`)`

			`# Check results`
			`messages = []`
			`for result in sorted(results):`
			`if result.ratio >= opt_ratio:`
			`messages.append(`
			`self.style.NOTICE(`
			`f"Document {result.doc_one_pk} fuzzy match"`
			`f" to {result.doc_two_pk} (confidence {result.ratio:.3f})",`
			`),`
			`)`
Add progress bar to document_fuzzy_match 2023-09-10 21:34:40 -07:00
			`if len(messages) == 0:`
			`messages.append(`
Adds multiprocessing to the fuzzy matching for a speedup 2023-09-11 08:48:30 -07:00			`self.style.SUCCESS("No matches found"),`
Add progress bar to document_fuzzy_match 2023-09-10 21:34:40 -07:00			`)`
			`self.stdout.writelines(`
			`messages,`
			`)`