Mirror of https://github.com/paperless-ngx/paperless-ngx.git
Enhancement: Limit excessively long content length when computing suggestions (#10656)
This helps prevent excessive processing times on very large documents by limiting the text analyzed during date parsing, tag prediction, and correspondent matching. If the document content exceeds 1.2M characters, it is cropped to 1M characters (the first 800K plus the last 200K).
parent: 84d85d7a23
commit: 8adc26e09d
3 changed files with 65 additions and 5 deletions
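
The cropping rule exercised by the new test could look roughly like the sketch below. This is a minimal illustration only: the helper name, constant names, and their placement are assumptions, and only the thresholds (a 1.2M-character limit, keeping 800K from the start and 200K from the end) come from the commit message and the test.

    # Minimal sketch of the cropping rule; names are hypothetical,
    # thresholds are taken from the commit message and the test below.
    SUGGESTION_CONTENT_LIMIT = 1_200_000
    HEAD_CHARS = 800_000
    TAIL_CHARS = 200_000

    def crop_for_suggestions(content: str) -> str:
        # Content at or under the limit is used as-is.
        if len(content) <= SUGGESTION_CONTENT_LIMIT:
            return content
        # Otherwise keep the head and tail, joined by a single space,
        # so roughly 1M characters are fed to the suggestion logic.
        return content[:HEAD_CHARS] + " " + content[-TAIL_CHARS:]

Keeping a slice from the end as well as the beginning presumably preserves details such as dates or correspondent information that often appear near the end of long documents, while still capping the amount of text processed.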
@@ -6,6 +6,7 @@ from unittest import mock
 from django.test import TestCase
 from django.test import override_settings
+from faker import Faker

 from documents.models import Correspondent
 from documents.models import Document

@@ -105,3 +106,27 @@ class TestDocument(TestCase):
             created=date(2020, 12, 25),
         )
         self.assertEqual(doc.get_public_filename(), "2020-12-25 test")
+
+
+def test_suggestion_content():
+    """
+    Check that the document content for suggestions is cropped only if it exceeds the length limit.
+    """
+    fake_text = Faker().text(max_nb_chars=1201000)
+
+    # Do not crop content under 1.2M chars
+    content_under_limit = fake_text[:1200000]
+    doc = Document(
+        title="test",
+        created=date(2025, 6, 1),
+        content=content_under_limit,
+    )
+    assert doc.suggestion_content == content_under_limit
+
+    # If over the limit, crop to 1M chars (800K from the beginning, 200K from the end)
+    content_over_limit = fake_text[:1200001]
+    expected_cropped_content = (
+        content_over_limit[:800000] + " " + content_over_limit[-200000:]
+    )
+    doc.content = content_over_limit
+    assert doc.suggestion_content == expected_cropped_content