mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-12-11 00:57:09 +01:00
feat: refactor for pluggable consumers
I've broken out the OCR-specific code from the consumers and dumped it
all into its own app, `paperless_tesseract`. This new app should serve
as a sample of how to create one's own consumer for different file
types.
Documentation for how to do this isn't ready yet, but for the impatient:
* Create a new app
* containing a `parsers.py` for your parser modelled after
`paperless_tesseract.parsers.RasterisedDocumentParser`
* containing a `signals.py` with a handler moddelled after
`paperless_tesseract.signals.ConsumerDeclaration`
* connect the signal handler to
`documents.signals.document_consumer_declaration` in
`your_app.apps`
* Install the app into Paperless by declaring
`PAPERLESS_INSTALLED_APPS=your_app`. Additional apps should be
separated with commas.
* Restart the consumer
This commit is contained in:
parent
0f7bfc547a
commit
55e81ca4bb
14 changed files with 429 additions and 286 deletions
45
src/documents/parsers.py
Normal file
45
src/documents/parsers.py
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
import logging
|
||||
import shutil
|
||||
import tempfile
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
|
||||
class ParseError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class DocumentParser(object):
|
||||
"""
|
||||
Subclass this to make your own parser. Have a look at
|
||||
`paperless_tesseract.parsers` for inspiration.
|
||||
"""
|
||||
|
||||
SCRATCH = settings.SCRATCH_DIR
|
||||
|
||||
def __init__(self, path):
|
||||
self.document_path = path
|
||||
self.tempdir = tempfile.mkdtemp(prefix="paperless", dir=self.SCRATCH)
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self.logging_group = None
|
||||
|
||||
def get_thumbnail(self):
|
||||
"""
|
||||
Returns the path to a file we can use as a thumbnail for this document.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def get_text(self):
|
||||
"""
|
||||
Returns the text from the document and only the text.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def log(self, level, message):
|
||||
getattr(self.logger, level)(message, extra={
|
||||
"group": self.logging_group
|
||||
})
|
||||
|
||||
def cleanup(self):
|
||||
self.log("debug", "Deleting directory {}".format(self.tempdir))
|
||||
shutil.rmtree(self.tempdir)
|
||||
Loading…
Add table
Add a link
Reference in a new issue