paperless-ngx/src/paperless/utils.py

import logging

from dateparser.languages.loader import LocaleDataLoader

logger = logging.getLogger("paperless.utils")

# Translation map from languages supported by Tesseract OCR
# to languages supported by dateparser.
# To add a language, make sure it is supported by both libraries.
# The ISO 639-2 will help you link a 3-char to 2-char language code.
# Links:
# - Tesseract languages: https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html
# - Python dateparser languages: https://dateparser.readthedocs.io/en/latest/supported_locales.html
# - ISO 639-2: https://www.loc.gov/standards/iso639-2/php/code_list.php
#
# NOTE: this description is deliberately a comment block placed *outside* the
# dict literal. A bare string literal placed inside the braces right before the
# first key is implicitly concatenated with that key (adjacent string literals
# are joined, and comments/newlines in between do not prevent it), which
# silently turns the first key into "<description>afr" and makes "afr"
# unmappable.
#
# TODO check these Dateparser languages as they are not referenced on the ISO639-2 standard,
# so we didn't find the equivalent in Tesseract:
# agq, asa, bez, brx, cgg, ckb, dav, dje, dyo, ebu, guz, jgo, jmc, kde, kea, khq, kln,
# ksb, ksf, ksh, lag, lkt, lrc, luy, mer, mfe, mgh, mgo, mua, mzn, naq, nmg, nnh, nus,
# rof, rwk, saq, sbp, she, ses, shi, teo, twq, tzm, vun, wae, xog, yav, yue
OCR_TO_DATEPARSER_LANGUAGES = {
    "afr": "af",
    "amh": "am",
    "ara": "ar",
    "asm": "as",
    "ast": "ast",
    "aze": "az",
    "bel": "be",
    "bul": "bg",
    "ben": "bn",
    "bod": "bo",
    "bre": "br",
    "bos": "bs",
    "cat": "ca",
    "cher": "chr",  # legacy spelling, kept for backward compatibility
    "chr": "chr",  # Cherokee, as named by Tesseract's traineddata files
    "ces": "cs",
    "cym": "cy",
    "dan": "da",
    "deu": "de",
    "dzo": "dz",
    "ell": "el",
    "eng": "en",
    "epo": "eo",
    "spa": "es",
    "est": "et",
    "eus": "eu",
    "fas": "fa",
    "fin": "fi",
    "fil": "fil",
    "fao": "fo",  # codespell:ignore
    "fra": "fr",
    "fry": "fy",
    "gle": "ga",
    "gla": "gd",
    "glg": "gl",
    "guj": "gu",
    "heb": "he",
    "hin": "hi",
    "hrv": "hr",
    "hun": "hu",
    "hye": "hy",
    "ind": "id",
    "isl": "is",
    "ita": "it",
    "jpn": "ja",
    "kat": "ka",
    "kaz": "kk",
    "khm": "km",
    "knda": "kn",  # legacy spelling, kept for backward compatibility
    "kan": "kn",  # Kannada, as named by Tesseract's traineddata files
    "kor": "ko",
    "kir": "ky",
    "ltz": "lb",
    "lao": "lo",
    "lit": "lt",
    "lav": "lv",
    "mal": "ml",
    "mon": "mn",
    "mar": "mr",
    "msa": "ms",
    "mlt": "mt",
    "mya": "my",
    "nep": "ne",
    "nld": "nl",
    "ori": "or",
    "pan": "pa",
    "pol": "pl",
    "pus": "ps",
    "por": "pt",
    "que": "qu",
    "ron": "ro",
    "rus": "ru",
    "sin": "si",
    "slk": "sk",
    "slv": "sl",
    "sqi": "sq",
    "srp": "sr",
    "swe": "sv",
    "swa": "sw",
    "tam": "ta",
    "tel": "te",  # codespell:ignore
    "tha": "th",  # codespell:ignore
    "tir": "ti",
    "tgl": "tl",
    "ton": "to",
    "tur": "tr",
    "uig": "ug",
    "ukr": "uk",
    "urd": "ur",
    "uzb": "uz",
    "via": "vi",  # legacy spelling, kept for backward compatibility
    "vie": "vi",  # Vietnamese, as named by Tesseract's traineddata files
    "yid": "yi",
    "yor": "yo",
    "chi": "zh",
}


def ocr_to_dateparser_languages(ocr_languages: str) -> list[str]:
    """
    Translate a Tesseract OCR_LANGUAGE setting into `dateparser` locale codes.

    The input is a "+"-separated list of Tesseract codes (e.g. "eng+fra"); each
    code may carry a "_"-separated script suffix (e.g. "aze_Cyrl").

    For every code, in order:

    - The part before the first "_" is looked up in OCR_TO_DATEPARSER_LANGUAGES.
      Codes without a mapping are logged at debug level and skipped.
    - The mapped base locale is handed to `LocaleDataLoader.get_locale_map` as a
      support check. An exception raised here is not handled per language: it
      reaches the outer handler, which logs a warning and makes the whole
      function return an empty list.
    - When a script suffix is present, "<base>-<Script>" (suffix title-cased) is
      checked the same way; if that check raises, an info message is logged and
      the base locale is used instead.
    - Locales are collected without duplicates, preserving first-seen order.

    Returns the collected locales. The list is empty when nothing could be
    mapped (an info message is logged) or when any unexpected error occurred.
    """
    loader = LocaleDataLoader()
    locales = []
    try:
        for token in ocr_languages.split("+"):
            # "aze_Cyrl" -> code "aze", script "Cyrl"; only the first suffix is used
            pieces = token.split("_")
            tesseract_code = pieces[0]
            script = pieces[1] if len(pieces) > 1 else None

            base_locale = OCR_TO_DATEPARSER_LANGUAGES.get(tesseract_code)
            if base_locale is None:
                logger.debug(
                    f'Unable to map OCR language "{tesseract_code}" to dateparser locale. ',
                )
                continue

            # Support check for the base locale; a failure aborts via the outer handler
            loader.get_locale_map(locales=[base_locale])

            # Default to the base locale and upgrade to the scripted variant
            # only when dateparser accepts it
            chosen = base_locale
            if script:
                scripted_locale = f"{base_locale}-{script.title()}"
                try:
                    loader.get_locale_map(locales=[scripted_locale])
                except Exception:
                    logger.info(
                        f"Language variant '{scripted_locale}' not supported by dateparser; falling back to base language '{base_locale}'. You can manually set PAPERLESS_DATE_PARSER_LANGUAGES if needed.",
                    )
                else:
                    chosen = scripted_locale

            if chosen not in locales:
                locales.append(chosen)
    except Exception as exc:
        logger.warning(
            f"Error auto-configuring dateparser languages. Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this. Detail: {exc}",
        )
        return []

    if len(locales) == 0:
        logger.info(
            "Unable to automatically determine dateparser languages from OCR_LANGUAGE, falling back to multi-language support.",
        )
    return locales
Fix: include application config language settings for dateparser auto-detection (#10722) 2025-08-31 15:22:39 -07:00			`import logging`

			`from dateparser.languages.loader import LocaleDataLoader`

			`logger = logging.getLogger("paperless.utils")`

			`OCR_TO_DATEPARSER_LANGUAGES = {`
Performance: Add support for configuring date parser languages (#10181) --------- Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com> 2025-07-01 07:57:38 +02:00			`"""`
			`Translation map from languages supported by Tesseract OCR`
			`to languages supported by dateparser.`
			`To add a language, make sure it is supported by both libraries.`
			`The ISO 639-2 will help you link a 3-char to 2-char language code.`
			`Links:`
			`- Tesseract languages: https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html`
			`- Python dateparser languages: https://dateparser.readthedocs.io/en/latest/supported_locales.html`
			`- ISO 639-2: https://www.loc.gov/standards/iso639-2/php/code_list.php`
			`"""`
			`# TODO check these Dateparser languages as they are not referenced on the ISO639-2 standard,`
			`# so we didn't find the equivalent in Tesseract:`
			`# agq, asa, bez, brx, cgg, ckb, dav, dje, dyo, ebu, guz, jgo, jmc, kde, kea, khq, kln,`
			`# ksb, ksf, ksh, lag, lkt, lrc, luy, mer, mfe, mgh, mgo, mua, mzn, naq, nmg, nnh, nus,`
			`# rof, rwk, saq, sbp, she, ses, shi, teo, twq, tzm, vun, wae, xog, yav, yue`
Fix: include application config language settings for dateparser auto-detection (#10722) 2025-08-31 15:22:39 -07:00			`"afr": "af",`
			`"amh": "am",`
			`"ara": "ar",`
			`"asm": "as",`
			`"ast": "ast",`
			`"aze": "az",`
			`"bel": "be",`
			`"bul": "bg",`
			`"ben": "bn",`
			`"bod": "bo",`
			`"bre": "br",`
			`"bos": "bs",`
			`"cat": "ca",`
			`"cher": "chr",`
			`"ces": "cs",`
			`"cym": "cy",`
			`"dan": "da",`
			`"deu": "de",`
			`"dzo": "dz",`
			`"ell": "el",`
			`"eng": "en",`
			`"epo": "eo",`
			`"spa": "es",`
			`"est": "et",`
			`"eus": "eu",`
			`"fas": "fa",`
			`"fin": "fi",`
			`"fil": "fil",`
			`"fao": "fo", # codespell:ignore`
			`"fra": "fr",`
			`"fry": "fy",`
			`"gle": "ga",`
			`"gla": "gd",`
			`"glg": "gl",`
			`"guj": "gu",`
			`"heb": "he",`
			`"hin": "hi",`
			`"hrv": "hr",`
			`"hun": "hu",`
			`"hye": "hy",`
			`"ind": "id",`
			`"isl": "is",`
			`"ita": "it",`
			`"jpn": "ja",`
			`"kat": "ka",`
			`"kaz": "kk",`
			`"khm": "km",`
			`"knda": "kn",`
			`"kor": "ko",`
			`"kir": "ky",`
			`"ltz": "lb",`
			`"lao": "lo",`
			`"lit": "lt",`
			`"lav": "lv",`
			`"mal": "ml",`
			`"mon": "mn",`
			`"mar": "mr",`
			`"msa": "ms",`
			`"mlt": "mt",`
			`"mya": "my",`
			`"nep": "ne",`
			`"nld": "nl",`
			`"ori": "or",`
			`"pan": "pa",`
			`"pol": "pl",`
			`"pus": "ps",`
			`"por": "pt",`
			`"que": "qu",`
			`"ron": "ro",`
			`"rus": "ru",`
			`"sin": "si",`
			`"slk": "sk",`
			`"slv": "sl",`
			`"sqi": "sq",`
			`"srp": "sr",`
			`"swe": "sv",`
			`"swa": "sw",`
			`"tam": "ta",`
			`"tel": "te", # codespell:ignore`
			`"tha": "th", # codespell:ignore`
			`"tir": "ti",`
			`"tgl": "tl",`
			`"ton": "to",`
			`"tur": "tr",`
			`"uig": "ug",`
			`"ukr": "uk",`
			`"urd": "ur",`
			`"uzb": "uz",`
			`"via": "vi",`
			`"yid": "yi",`
			`"yor": "yo",`
			`"chi": "zh",`
			`}`


			`def ocr_to_dateparser_languages(ocr_languages: str) -> list[str]:`
			`"""`
			`Convert Tesseract OCR_LANGUAGE codes (ISO 639-2, e.g. "eng+fra", with optional scripts like "aze_Cyrl")`
			into a list of locales compatible with the `dateparser` library.

			`- If a script is provided (e.g., "aze_Cyrl"), attempts to use the full locale (e.g., "az-Cyrl").`
			`Falls back to the base language (e.g., "az") if needed.`
			`- If a language cannot be mapped or validated, it is skipped with a warning.`
			`- Returns a list of valid locales, or an empty list if none could be converted.`
			`"""`
			`loader = LocaleDataLoader()`
			`result = []`
			`try:`
			`for ocr_language in ocr_languages.split("+"):`
			`# Split into language and optional script`
			`ocr_lang_part, *script = ocr_language.split("_")`
			`ocr_script_part = script[0] if script else None`

			`language_part = OCR_TO_DATEPARSER_LANGUAGES.get(ocr_lang_part)`
			`if language_part is None:`
			`logger.debug(`
			`f'Unable to map OCR language "{ocr_lang_part}" to dateparser locale. ',`
			`)`
			`continue`

			`# Ensure base language is supported by dateparser`
			`loader.get_locale_map(locales=[language_part])`

			`# Try to add the script part if it's supported by dateparser`
			`if ocr_script_part:`
			`dateparser_language = f"{language_part}-{ocr_script_part.title()}"`
			`try:`
			`loader.get_locale_map(locales=[dateparser_language])`
			`except Exception:`
			`logger.info(`
			`f"Language variant '{dateparser_language}' not supported by dateparser; falling back to base language '{language_part}'. You can manually set PAPERLESS_DATE_PARSER_LANGUAGES if needed.",`
			`)`
			`dateparser_language = language_part`
			`else:`
			`dateparser_language = language_part`
			`if dateparser_language not in result:`
			`result.append(dateparser_language)`
			`except Exception as e:`
			`logger.warning(`
			`f"Error auto-configuring dateparser languages. Set PAPERLESS_DATE_PARSER_LANGUAGES parameter to avoid this. Detail: {e}",`
			`)`
			`return []`
			`if not result:`
			`logger.info(`
			`"Unable to automatically determine dateparser languages from OCR_LANGUAGE, falling back to multi-language support.",`
			`)`
			`return result`