import json
import os
import re
import subprocess

import ocrmypdf
import pdftotext
from PIL import Image
from django.conf import settings
from ocrmypdf import InputFileError, EncryptedPdfError

from documents.parsers import DocumentParser, ParseError, run_convert


class RasterisedDocumentParser(DocumentParser):
    """
    This parser uses Tesseract to try and get some text out of a rasterised
    image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
    """

    def get_thumbnail(self, document_path, mime_type):
        """
        The thumbnail of a PDF is just a 500px wide image of the first page.
        """

        out_path = os.path.join(self.tempdir, "convert.png")

        # Run convert to get a decent thumbnail
        try:
            run_convert(density=300,
                        scale="500x5000>",
                        alpha="remove",
                        strip=True,
                        trim=False,
                        input_file="{}[0]".format(document_path),
                        output_file=out_path,
                        logging_group=self.logging_group)
        except ParseError:
            # if convert fails, fall back to extracting
            # the first PDF page as a PNG using Ghostscript
            self.log(
                'warning',
                "Thumbnail generation with ImageMagick failed, falling back "
                "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!")
            gs_out_path = os.path.join(self.tempdir, "gs_out.png")
            cmd = [settings.GS_BINARY,
                   "-q",
                   "-sDEVICE=pngalpha",
                   "-o", gs_out_path,
                   document_path]
            if not subprocess.Popen(cmd).wait() == 0:
                raise ParseError("Thumbnail (gs) failed at {}".format(cmd))
            # then run convert on the output from gs
            run_convert(density=300,
                        scale="500x5000>",
                        alpha="remove",
                        strip=True,
                        trim=False,
                        input_file=gs_out_path,
                        output_file=out_path,
                        logging_group=self.logging_group)

        return out_path

    def is_image(self, mime_type):
        return mime_type in [
            "image/png",
            "image/jpeg",
            "image/tiff",
            "image/bmp",
            "image/gif",
        ]

    def get_dpi(self, image):
        try:
            with Image.open(image) as im:
                x, y = im.info['dpi']
                return x
        except Exception as e:
            self.log(
                'warning',
                f"Error while getting DPI from image {image}: {e}")
            return None

    def parse(self, document_path, mime_type):
        text_original = get_text_from_pdf(document_path)
        has_text = text_original and len(text_original) > 50

        if settings.OCR_MODE == "skip_noarchive" and has_text:
            self.text = text_original
            return

        archive_path = os.path.join(self.tempdir, "archive.pdf")

        ocr_args = {
            'input_file': document_path,
            'output_file': archive_path,
            'use_threads': True,
            'jobs': settings.THREADS_PER_WORKER,
            'language': settings.OCR_LANGUAGE,
            'output_type': settings.OCR_OUTPUT_TYPE,
            'progress_bar': False,
            'clean': True
        }

        if settings.OCR_PAGES > 0:
            ocr_args['pages'] = f"1-{settings.OCR_PAGES}"

        # Mode selection.

        if settings.OCR_MODE in ['skip', 'skip_noarchive']:
            ocr_args['skip_text'] = True
        elif settings.OCR_MODE == 'redo':
            ocr_args['redo_ocr'] = True
        elif settings.OCR_MODE == 'force':
            ocr_args['force_ocr'] = True

        if self.is_image(mime_type):
            dpi = self.get_dpi(document_path)
            if dpi:
                self.log(
                    "debug",
                    f"Detected DPI for image {document_path}: {dpi}"
                )
                ocr_args['image_dpi'] = dpi
            elif settings.OCR_IMAGE_DPI:
                ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI
            else:
                raise ParseError(
                    f"Cannot produce archive PDF for image {document_path}, "
                    f"no DPI information is present in this image and "
                    f"OCR_IMAGE_DPI is not set.")

        if settings.OCR_USER_ARGS:
            try:
                user_args = json.loads(settings.OCR_USER_ARGS)
                ocr_args = {**ocr_args, **user_args}
            except Exception as e:
                self.log(
                    "warning",
                    f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
                    f"they will not be used: {e}")

        # This forces tesseract to use one core per page.
        os.environ['OMP_THREAD_LIMIT'] = "1"

        try:
            self.log("debug",
                     f"Calling OCRmyPDF with {str(ocr_args)}")
            ocrmypdf.ocr(**ocr_args)
            # success! announce results
            self.archive_path = archive_path
            self.text = get_text_from_pdf(archive_path)

        except (InputFileError, EncryptedPdfError) as e:
            # This happens with some PDFs when used with the redo_ocr option.
            # This is not the end of the world, we'll just use what we already
            # have in the document.
            self.text = text_original
            # Also, no archived file.
            if not self.text:
                # However, if we don't have anything, fail:
                raise ParseError(e)

        except Exception as e:
            # Anything else is probably serious.
            raise ParseError(e)

        if not self.text:
            # This may happen for files that don't have any text.
            self.log(
                'warning',
                f"Document {document_path} does not have any text."
                f"This is probably an error or you tried to add an image "
                f"without text, or something is wrong with this document.")
            self.text = ""


def strip_excess_whitespace(text):
    if not text:
        return None

    collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
    no_leading_whitespace = re.sub(
        r"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces)
    no_trailing_whitespace = re.sub(
        r"([^\S\n\r]+)$", '', no_leading_whitespace)

    # TODO: this needs a rework
    return no_trailing_whitespace.strip()


def get_text_from_pdf(pdf_file):

    with open(pdf_file, "rb") as f:
        try:
            pdf = pdftotext.PDF(f)
        except pdftotext.Error:
            # might not be a PDF file
            return None

    text = "\n".join(pdf)

    return strip_excess_whitespace(text)