mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-12-19 21:16:56 +01:00
Adds better handling for files with invalid utf8 content
This commit is contained in:
parent
350c20d6ab
commit
111960c530
6 changed files with 47 additions and 16 deletions
|
|
@@ -122,8 +122,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||
and os.path.isfile(sidecar_file)
|
||||
and settings.OCR_MODE != "redo"
|
||||
):
|
||||
-with open(sidecar_file) as f:
|
||||
-text = f.read()
|
||||
+text = self.read_file_handle_unicode_errors(sidecar_file)
|
||||
|
||||
if "[OCR skipped on page" not in text:
|
||||
# This happens when there's already text in the input file.
|
||||
|
|
@@ -155,7 +154,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||
tmp.name,
|
||||
],
|
||||
)
|
||||
-text = tmp.read()
|
||||
+text = self.read_file_handle_unicode_errors(Path(tmp.name))
|
||||
|
||||
return post_process_text(text)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue