Add PAPERLESS_OCR_SKIP_ARCHIVE_FILE config setting

2025-12-19 21:16:56 +01:00 · 2023-02-23 22:42:57 -05:00 · 2023-02-23 22:42:57 -05:00 · ca412e0184
commit ca412e0184
parent 8a89f5ae27
8 changed files with 185 additions and 14 deletions
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -294,7 +294,11 @@ class RasterisedDocumentParser(DocumentParser):

        # If the original has text, and the user doesn't want an archive,
        # we're done here
-        if settings.OCR_MODE == "skip_noarchive" and original_has_text:
+        skip_archive_for_text = (
+            settings.OCR_MODE == "skip_noarchive"
+            or settings.OCR_SKIP_ARCHIVE_FILE in ["with_text", "always"]
+        )
+        if skip_archive_for_text and original_has_text:
            self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
            self.text = text_original
            return
@@ -320,7 +324,8 @@ class RasterisedDocumentParser(DocumentParser):
            self.log("debug", f"Calling OCRmyPDF with args: {args}")
            ocrmypdf.ocr(**args)

-            self.archive_path = archive_path
+            if settings.OCR_SKIP_ARCHIVE_FILE != "always":
+                self.archive_path = archive_path

            self.text = self.extract_text(sidecar_file, archive_path)