mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-12-07 07:15:07 +01:00
Collapsing excess whitespace after OCR
This commit is contained in:
parent
14811a4a49
commit
63de2ca1b0
2 changed files with 22 additions and 1 deletion
|
|
@@ -283,7 +283,7 @@ class Consumer(object):
|
|||
r = " ".join(r)
|
||||
|
||||
# Strip out excess white space to allow matching to go smoother
|
||||
return re.sub(r"\s+", " ", r)
|
||||
return strip_excess_whitespace(r)
|
||||
|
||||
def _store(self, text, doc, thumbnail):
|
||||
|
||||
|
|
@@ -360,6 +360,13 @@ class Consumer(object):
|
|||
return Document.objects.filter(checksum=checksum).exists()
|
||||
|
||||
|
||||
def strip_excess_whitespace(text):
    """Collapse runs of horizontal whitespace while keeping line breaks.

    Three passes are applied in order:

    1. Every run of whitespace characters other than ``\\r`` and ``\\n``
       is replaced by a single space.
    2. Horizontal whitespace that directly follows one or more line-break
       characters is removed, i.e. indentation on every line after the
       first is dropped.
    3. Horizontal whitespace at the end of the text is removed.  ``$`` is
       used without ``re.MULTILINE``, so this only applies at the very end
       of the string (or just before a single final newline), not at the
       end of every line.

    Line-break characters themselves are never altered.  Leading
    whitespace on the first line is only collapsed to one space by the
    first pass; it is not removed.

    :param text: the string to normalise.
    :return: the normalised string.
    """
    # Raw strings throughout: "\S" in a normal literal is an invalid string
    # escape and triggers a Deprecation/SyntaxWarning on modern Pythons.
    # "[^\S\r\n]" reads as "whitespace, but not CR or LF".
    collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
    no_leading_whitespace = re.sub(
        r"([\n\r]+)([^\S\n\r]+)", r"\1", collapsed_spaces
    )
    no_trailing_whitespace = re.sub(
        r"([^\S\n\r]+)$", "", no_leading_whitespace
    )
    return no_trailing_whitespace
|
||||
|
||||
|
||||
def image_to_string(args):
|
||||
img, lang = args
|
||||
ocr = pyocr.get_available_tools()[0]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue