2016-03-06 17:26:07 +00:00
|
|
|
import logging
|
2015-12-20 19:23:33 +00:00
|
|
|
import os
|
2022-03-11 10:55:51 -08:00
|
|
|
from pathlib import Path
|
|
|
|
|
from pathlib import PurePath
|
2021-02-21 12:43:55 +01:00
|
|
|
from threading import Thread
|
2020-11-26 17:41:50 +01:00
|
|
|
from time import sleep
|
2015-12-20 19:23:33 +00:00
|
|
|
|
|
|
|
|
from django.conf import settings
|
2022-03-11 10:55:51 -08:00
|
|
|
from django.core.management.base import BaseCommand
|
|
|
|
|
from django.core.management.base import CommandError
|
2020-11-16 18:26:54 +01:00
|
|
|
from django_q.tasks import async_task
|
2020-11-29 15:39:43 +01:00
|
|
|
from documents.models import Tag
|
2020-12-01 15:26:05 +01:00
|
|
|
from documents.parsers import is_file_ext_supported
|
2022-03-11 10:55:51 -08:00
|
|
|
from watchdog.events import FileSystemEventHandler
|
|
|
|
|
from watchdog.observers.polling import PollingObserver
|
2020-11-29 15:39:43 +01:00
|
|
|
|
2018-05-11 14:01:21 +02:00
|
|
|
try:
|
2020-11-29 15:39:43 +01:00
|
|
|
from inotifyrecursive import INotify, flags
|
2018-05-11 14:01:21 +02:00
|
|
|
except ImportError:
|
2018-09-02 20:33:49 +01:00
|
|
|
INotify = flags = None
|
2018-05-11 14:01:21 +02:00
|
|
|
|
2021-02-05 01:10:29 +01:00
|
|
|
# Module-level logger shared by the helper functions and the management
# command defined in this file.
logger = logging.getLogger("paperless.management.consumer")
|
2016-01-23 02:33:29 +00:00
|
|
|
|
2020-11-01 23:07:54 +01:00
|
|
|
|
2020-11-29 15:39:43 +01:00
|
|
|
def _tags_from_path(filepath):
    """Map the sub-directories leading to ``filepath`` onto Tag primary keys.

    The path is taken relative to ``settings.CONSUMPTION_DIR``; every
    directory component between that root and the file itself yields one
    tag. A tag is looked up case-insensitively by name and created with the
    directory's name when no match exists.

    Returns:
        A set with the primary key of each matching or newly created Tag.

    Raises:
        ValueError: if ``filepath`` is not located below
            ``settings.CONSUMPTION_DIR`` (raised by ``Path.relative_to``).
    """
    containing_dir = Path(filepath).relative_to(settings.CONSUMPTION_DIR).parent
    return {
        Tag.objects.get_or_create(
            name__iexact=dir_name,
            defaults={"name": dir_name},
        )[0].pk
        for dir_name in containing_dir.parts
    }
|
|
|
|
|
|
|
|
|
|
|
2021-08-08 21:29:36 +02:00
|
|
|
def _is_ignored(filepath: str) -> bool:
    """Tell whether ``filepath`` matches one of the configured ignore patterns.

    The path is made relative to ``settings.CONSUMPTION_DIR`` and tested
    against each entry of ``settings.CONSUMER_IGNORE_PATTERNS`` using
    ``PurePath.match``.

    Returns:
        True as soon as one pattern matches, False when none does.

    Raises:
        ValueError: if ``filepath`` is not located below
            ``settings.CONSUMPTION_DIR`` (raised by ``PurePath.relative_to``).
    """
    relative = PurePath(filepath).relative_to(settings.CONSUMPTION_DIR)
    for pattern in settings.CONSUMER_IGNORE_PATTERNS:
        if relative.match(pattern):
            return True
    return False
|
2021-05-19 19:56:01 +02:00
|
|
|
|
|
|
|
|
|
2020-11-29 15:39:43 +01:00
|
|
|
def _consume(filepath):
    """Queue ``filepath`` for consumption unless it has to be skipped.

    Skipped without logging: directories and paths matching an ignore
    pattern. Skipped with a log entry: paths that are no longer a regular
    file (debug) and files whose extension is not supported (warning).

    When ``settings.CONSUMER_SUBDIRS_AS_TAGS`` is truthy the tag ids derived
    from the file's sub-directories are handed to the task as
    ``override_tag_ids``. A failure while deriving them is logged and the
    file is still queued, just without tags.

    Exceptions raised while queueing the task are logged rather than
    propagated.
    """
    if os.path.isdir(filepath):
        return
    if _is_ignored(filepath):
        return

    if not os.path.isfile(filepath):
        logger.debug(f"Not consuming file {filepath}: File has moved.")
        return

    _, extension = os.path.splitext(filepath)
    if not is_file_ext_supported(extension):
        logger.warning(f"Not consuming file {filepath}: Unknown file extension.")
        return

    path_tag_ids = None
    try:
        if settings.CONSUMER_SUBDIRS_AS_TAGS:
            path_tag_ids = _tags_from_path(filepath)
    except Exception:
        logger.exception("Error creating tags from path")

    try:
        logger.info(f"Adding {filepath} to the task queue.")
        # An empty collection of tag ids is passed on as None.
        override_tag_ids = path_tag_ids or None
        # The task name is the file name, cut off after 100 characters.
        task_name = os.path.basename(filepath)[:100]
        async_task(
            "documents.tasks.consume_file",
            filepath,
            override_tag_ids=override_tag_ids,
            task_name=task_name,
        )
    except Exception:
        # Catch all so that the consumer won't crash.
        # This is also what the test case is listening for to check for
        # errors.
        logger.exception("Error while consuming document")
|
2020-11-26 17:41:50 +01:00
|
|
|
|
|
|
|
|
|
2021-02-21 12:14:54 +01:00
|
|
|
def _consume_wait_unmodified(file):
    """Consume ``file`` once its modification time has stopped changing.

    Ignored paths return immediately. Otherwise the file's mtime is sampled
    up to ``settings.CONSUMER_POLLING_RETRY_COUNT`` times, sleeping
    ``settings.CONSUMER_POLLING_DELAY`` between samples. Two consecutive
    identical samples cause the file to be passed to ``_consume``. If the
    file disappears while waiting, this is logged at debug level and nothing
    is consumed. If the samples never settle, an error is logged.
    """
    if _is_ignored(file):
        return

    logger.debug(f"Waiting for file {file} to remain unmodified")

    # -1 is the initial "no sample yet" value, so the first comparison
    # against a real mtime is not expected to match.
    previous_mtime = -1
    attempts = 0
    while attempts < settings.CONSUMER_POLLING_RETRY_COUNT:
        try:
            observed_mtime = os.stat(file).st_mtime
        except FileNotFoundError:
            logger.debug(
                f"File {file} moved while waiting for it to remain unmodified.",
            )
            return

        if observed_mtime == previous_mtime:
            _consume(file)
            return

        previous_mtime = observed_mtime
        sleep(settings.CONSUMER_POLLING_DELAY)
        attempts += 1

    logger.error(f"Timeout while waiting on file {file} to remain unmodified.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Handler(FileSystemEventHandler):
    """Event handler that consumes created or moved files in the background.

    Each event starts its own thread running ``_consume_wait_unmodified``,
    so the waiting done there does not block the caller delivering events.
    """

    def on_created(self, event):
        """Start a worker thread for the path that was created."""
        created_path = event.src_path
        worker = Thread(target=_consume_wait_unmodified, args=(created_path,))
        worker.start()

    def on_moved(self, event):
        """Start a worker thread for the destination path of a move."""
        moved_to = event.dest_path
        worker = Thread(target=_consume_wait_unmodified, args=(moved_to,))
        worker.start()
|
2020-11-01 23:07:54 +01:00
|
|
|
|
|
|
|
|
|
2016-02-14 16:09:52 +00:00
|
|
|
class Command(BaseCommand):
    """
    Consume the files already present in the consumption directory and,
    unless ``--oneshot`` is given, keep watching the directory for new files
    using either inotify or polling.
    """

    # This is here primarily for the tests and is irrelevant in production.
    # Both watch loops check it and shut down once it is set.
    stop_flag = False

    # The PollingObserver created by handle_polling(); stays None otherwise.
    observer = None

    def add_arguments(self, parser):
        """Register the optional ``directory`` argument and ``--oneshot``."""
        parser.add_argument(
            "directory",
            default=settings.CONSUMPTION_DIR,
            nargs="?",
            help="The consumption directory.",
        )
        parser.add_argument("--oneshot", action="store_true", help="Run only once.")

    def handle(self, *args, **options):
        """Run the consumer.

        Every file already in the directory is passed to ``_consume`` first
        (walking sub-directories when ``settings.CONSUMER_RECURSIVE`` is
        set). With ``--oneshot`` the command then returns; otherwise it
        blocks in the inotify or polling watch loop.

        Raises:
            CommandError: if no directory is configured or the directory
                does not exist.
        """
        directory = options["directory"]
        recursive = settings.CONSUMER_RECURSIVE

        if not directory:
            raise CommandError("CONSUMPTION_DIR does not appear to be set.")

        if not os.path.isdir(directory):
            raise CommandError(f"Consumption directory {directory} does not exist")

        if recursive:
            for dirpath, _, filenames in os.walk(directory):
                for filename in filenames:
                    filepath = os.path.join(dirpath, filename)
                    _consume(filepath)
        else:
            for entry in os.scandir(directory):
                _consume(entry.path)

        if options["oneshot"]:
            return

        # inotify is used only when polling is disabled (interval 0) and the
        # optional inotifyrecursive import succeeded.
        if settings.CONSUMER_POLLING == 0 and INotify:
            self.handle_inotify(directory, recursive)
        else:
            self.handle_polling(directory, recursive)

        logger.debug("Consumer exiting.")

    def handle_polling(self, directory, recursive):
        """Watch ``directory`` with a PollingObserver until it stops.

        The observer is stopped when ``stop_flag`` is set or on
        KeyboardInterrupt, and is always joined before returning.
        """
        logger.info(f"Polling directory for changes: {directory}")
        self.observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
        self.observer.schedule(Handler(), directory, recursive=recursive)
        self.observer.start()
        try:
            while self.observer.is_alive():
                # Join with a timeout so stop_flag gets re-checked regularly.
                self.observer.join(1)
                if self.stop_flag:
                    self.observer.stop()
        except KeyboardInterrupt:
            self.observer.stop()
        self.observer.join()

    def handle_inotify(self, directory, recursive):
        """Watch ``directory`` through inotify until ``stop_flag`` is set.

        Files are consumed on CLOSE_WRITE and MOVED_TO events. A
        KeyboardInterrupt ends the loop quietly. The watch is removed and
        the inotify instance closed on every exit path, including
        unexpected exceptions, so the underlying descriptor is not leaked.
        """
        logger.info(f"Using inotify to watch directory for changes: {directory}")

        inotify = INotify()
        try:
            inotify_flags = flags.CLOSE_WRITE | flags.MOVED_TO
            if recursive:
                descriptor = inotify.add_watch_recursive(directory, inotify_flags)
            else:
                descriptor = inotify.add_watch(directory, inotify_flags)

            try:
                while not self.stop_flag:
                    # The read timeout (presumably milliseconds - confirm
                    # against the inotify library docs) bounds how long the
                    # loop can go without re-checking stop_flag.
                    for event in inotify.read(timeout=1000):
                        if recursive:
                            path = inotify.get_path(event.wd)
                        else:
                            path = directory
                        filepath = os.path.join(path, event.name)
                        _consume(filepath)
            except KeyboardInterrupt:
                pass
            finally:
                inotify.rm_watch(descriptor)
        finally:
            # Runs even if adding or removing the watch failed.
            inotify.close()
|