mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-12-13 10:07:15 +01:00
75 lines
2.1 KiB
Python
75 lines
2.1 KiB
Python
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
from django.conf import settings
|
||
|
|
|
||
|
|
from paperless_tesseract.parsers import RasterisedDocumentParser
|
||
|
|
|
||
|
|
|
||
|
|
class RemoteEngineConfig:
|
||
|
|
def __init__(
|
||
|
|
self,
|
||
|
|
engine: str,
|
||
|
|
api_key: str | None = None,
|
||
|
|
endpoint: str | None = None,
|
||
|
|
):
|
||
|
|
self.engine = engine
|
||
|
|
self.api_key = api_key
|
||
|
|
self.endpoint = endpoint
|
||
|
|
|
||
|
|
def engine_is_valid(self):
|
||
|
|
valid = self.engine in ["azureai"] and self.api_key is not None
|
||
|
|
if self.engine == "azureai":
|
||
|
|
valid = valid and self.endpoint is not None
|
||
|
|
return valid
|
||
|
|
|
||
|
|
|
||
|
|
class RemoteDocumentParser(RasterisedDocumentParser):
    """
    Document parser that hands OCR off to a remote engine.
    """

    logging_name = "paperless.parsing.remote"

    def get_settings(self) -> RemoteEngineConfig:
        """
        Build the remote engine configuration from the REMOTE_OCR_ENGINE,
        REMOTE_OCR_API_KEY and REMOTE_OCR_ENDPOINT Django settings.
        """
        engine = settings.REMOTE_OCR_ENGINE
        api_key = settings.REMOTE_OCR_API_KEY
        endpoint = settings.REMOTE_OCR_ENDPOINT
        return RemoteEngineConfig(
            engine=engine,
            api_key=api_key,
            endpoint=endpoint,
        )

    def supported_mime_types(self):
        """
        Return the MIME types this parser handles, or an empty list when
        no valid remote engine is configured.
        """
        # NOTE(review): self.settings is presumably populated by the base
        # class from get_settings() - confirm against the parent parser.
        if not self.settings.engine_is_valid():
            return []
        return [
            "application/pdf",
            "image/png",
            "image/jpeg",
            "image/tiff",
            "image/bmp",
            "image/gif",
            "image/webp",
        ]

    def azure_ai_vision_parse(
        self,
        file: Path,
    ) -> str | None:
        """
        Parse a document with the Azure AI Vision API.

        Not implemented yet: currently always returns None.
        """
        # TODO: Implement the Azure AI Vision API parsing logic
        return None

    def parse(self, document_path: Path, mime_type, file_name=None):
        """
        Extract text from document_path with the configured remote engine
        and store it on self.text.

        When no valid engine is configured a warning is logged and the
        text is set to an empty string.
        """
        config = self.settings
        if config.engine_is_valid():
            if config.engine == "azureai":
                self.text = self.azure_ai_vision_parse(document_path)
            return

        self.log.warning(
            "No valid remote parser engine is configured, content will be empty.",
        )
        self.text = ""
|