mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-01-04 12:36:07 +01:00
1316 lines
38 KiB
Markdown
1316 lines
38 KiB
Markdown
# IntelliDocs-ngx Improvement Roadmap
|
|
|
|
## Executive Summary
|
|
|
|
This document provides a prioritized roadmap for improving IntelliDocs-ngx with detailed recommendations, implementation plans, and expected outcomes.
|
|
|
|
---
|
|
|
|
## Quick Reference: Priority Matrix
|
|
|
|
| Category | Priority | Effort | Impact | Timeline |
|
|
|----------|----------|--------|--------|----------|
|
|
| Performance Optimization | **High** | Low-Medium | High | 2-3 weeks |
|
|
| Security Hardening | **High** | Medium | High | 3-4 weeks |
|
|
| AI/ML Enhancement | **High** | High | Very High | 4-6 weeks |
|
|
| Advanced OCR | **High** | Medium | High | 3-4 weeks |
|
|
| Mobile Experience | Medium | Very High | Medium | 6-8 weeks |
|
|
| Collaboration Features | Medium | Medium-High | Medium | 4-5 weeks |
|
|
| Integration Expansion | Medium | Medium | Medium | 3-4 weeks |
|
|
| Analytics & Reporting | Medium | Medium | Medium | 3-4 weeks |
|
|
|
|
---
|
|
|
|
## Part 1: Critical Improvements (Start Immediately)
|
|
|
|
### 1.1 Performance Optimization
|
|
|
|
#### 1.1.1 Database Query Optimization
|
|
|
|
**Current Issues**:
|
|
- N+1 queries in document list endpoint
|
|
- Missing indexes on commonly filtered fields
|
|
- Inefficient JOIN operations
|
|
- Slow full-text search on large datasets
|
|
|
|
**Proposed Solutions**:
|
|
|
|
```python
|
|
# BEFORE (N+1 problem)
|
|
def list_documents(request):
|
|
documents = Document.objects.all()
|
|
for doc in documents:
|
|
correspondent_name = doc.correspondent.name # Extra query each time
|
|
doc_type_name = doc.document_type.name # Extra query each time
|
|
|
|
# AFTER (Optimized)
|
|
def list_documents(request):
|
|
documents = Document.objects.select_related(
|
|
'correspondent',
|
|
'document_type',
|
|
'storage_path',
|
|
'owner'
|
|
).prefetch_related(
|
|
'tags',
|
|
'custom_fields'
|
|
).all()
|
|
```
|
|
|
|
**Database Migrations Needed**:
|
|
|
|
```python
|
|
# Migration: Add composite indexes
|
|
class Migration(migrations.Migration):
|
|
operations = [
|
|
migrations.AddIndex(
|
|
model_name='document',
|
|
index=models.Index(
|
|
fields=['correspondent', 'created'],
|
|
name='doc_corr_created_idx'
|
|
)
|
|
),
|
|
migrations.AddIndex(
|
|
model_name='document',
|
|
index=models.Index(
|
|
fields=['document_type', 'created'],
|
|
name='doc_type_created_idx'
|
|
)
|
|
),
|
|
migrations.AddIndex(
|
|
model_name='document',
|
|
index=models.Index(
|
|
fields=['owner', 'created'],
|
|
name='doc_owner_created_idx'
|
|
)
|
|
),
|
|
# Full-text search optimization
|
|
migrations.RunSQL(
|
|
"CREATE INDEX doc_content_idx ON documents_document "
|
|
"USING gin(to_tsvector('english', content));"
|
|
),
|
|
]
|
|
```
|
|
|
|
**Expected Results**:
|
|
- 5-10x faster document list queries
|
|
- 3-5x faster search queries
|
|
- Reduced database CPU usage by 40-60%
|
|
|
|
**Implementation Time**: 1 week
|
|
|
|
---
|
|
|
|
#### 1.1.2 Caching Strategy
|
|
|
|
**Redis Caching Implementation**:
|
|
|
|
```python
|
|
# documents/caching.py
|
|
from django.core.cache import cache
|
|
from django.db.models.signals import post_save, post_delete
|
|
from functools import wraps
|
|
|
|
def cache_document_metadata(timeout=3600):
|
|
"""Cache document metadata for 1 hour"""
|
|
def decorator(func):
|
|
@wraps(func)
|
|
def wrapper(document_id, *args, **kwargs):
|
|
cache_key = f'doc_metadata_{document_id}'
|
|
result = cache.get(cache_key)
|
|
if result is None:
|
|
result = func(document_id, *args, **kwargs)
|
|
cache.set(cache_key, result, timeout)
|
|
return result
|
|
return wrapper
|
|
return decorator
|
|
|
|
# Invalidate cache on document changes
|
|
@receiver(post_save, sender=Document)
|
|
def invalidate_document_cache(sender, instance, **kwargs):
|
|
cache_keys = [
|
|
f'doc_metadata_{instance.id}',
|
|
f'doc_thumbnail_{instance.id}',
|
|
f'doc_preview_{instance.id}',
|
|
]
|
|
cache.delete_many(cache_keys)
|
|
|
|
# Cache correspondent/tag lists (rarely change)
|
|
def get_correspondent_list():
|
|
cache_key = 'correspondent_list'
|
|
result = cache.get(cache_key)
|
|
if result is None:
|
|
result = list(Correspondent.objects.all().values('id', 'name'))
|
|
cache.set(cache_key, result, 3600 * 24) # 24 hours
|
|
return result
|
|
```
|
|
|
|
**Configuration**:
|
|
|
|
```python
|
|
# settings.py
|
|
CACHES = {
|
|
'default': {
|
|
'BACKEND': 'django_redis.cache.RedisCache',
|
|
'LOCATION': 'redis://redis:6379/1',
|
|
'OPTIONS': {
|
|
'CLIENT_CLASS': 'django_redis.client.DefaultClient',
|
|
'PARSER_CLASS': 'redis.connection.HiredisParser',
|
|
'CONNECTION_POOL_CLASS_KWARGS': {
|
|
'max_connections': 50,
|
|
}
|
|
},
|
|
'KEY_PREFIX': 'intellidocs',
|
|
'TIMEOUT': 3600,
|
|
}
|
|
}
|
|
```
|
|
|
|
**Expected Results**:
|
|
- 10x faster metadata queries
|
|
- 50% reduction in database load
|
|
- Better scalability for concurrent users
|
|
|
|
**Implementation Time**: 1 week
|
|
|
|
---
|
|
|
|
#### 1.1.3 Frontend Performance
|
|
|
|
**Lazy Loading and Code Splitting**:
|
|
|
|
```typescript
|
|
// app-routing.module.ts - Implement lazy loading
|
|
const routes: Routes = [
|
|
{
|
|
path: 'documents',
|
|
loadChildren: () => import('./documents/documents.module')
|
|
.then(m => m.DocumentsModule)
|
|
},
|
|
{
|
|
path: 'settings',
|
|
loadChildren: () => import('./settings/settings.module')
|
|
.then(m => m.SettingsModule)
|
|
},
|
|
// ... other routes
|
|
];
|
|
```
|
|
|
|
**Virtual Scrolling for Large Lists**:
|
|
|
|
```typescript
|
|
// document-list.component.ts
|
|
import { ScrollingModule } from '@angular/cdk/scrolling';
|
|
|
|
@Component({
|
|
template: `
|
|
<cdk-virtual-scroll-viewport itemSize="100" class="document-list">
|
|
<div *cdkVirtualFor="let document of documents" class="document-item">
|
|
<app-document-card [document]="document"></app-document-card>
|
|
</div>
|
|
</cdk-virtual-scroll-viewport>
|
|
`
|
|
})
|
|
export class DocumentListComponent {
|
|
// Only renders visible items + buffer
|
|
}
|
|
```
|
|
|
|
**Image Optimization**:
|
|
|
|
```typescript
|
|
// Add WebP thumbnail support
|
|
getOptimizedThumbnailUrl(documentId: number): string {
|
|
// Check browser WebP support
|
|
if (this.supportsWebP()) {
|
|
return `/api/documents/${documentId}/thumb/?format=webp`;
|
|
}
|
|
return `/api/documents/${documentId}/thumb/`;
|
|
}
|
|
|
|
// Progressive loading
|
|
loadThumbnail(documentId: number): void {
|
|
// Load low-quality placeholder first
|
|
this.thumbnailUrl = `/api/documents/${documentId}/thumb/?quality=10`;
|
|
|
|
// Then load high-quality version
|
|
const img = new Image();
|
|
img.onload = () => {
|
|
this.thumbnailUrl = `/api/documents/${documentId}/thumb/?quality=85`;
|
|
};
|
|
img.src = `/api/documents/${documentId}/thumb/?quality=85`;
|
|
}
|
|
```
|
|
|
|
**Expected Results**:
|
|
- 50% faster initial page load (2-4s → 1-2s)
|
|
- 60% smaller bundle size
|
|
- Smooth scrolling with 10,000+ documents
|
|
|
|
**Implementation Time**: 1 week
|
|
|
|
---
|
|
|
|
### 1.2 Security Hardening
|
|
|
|
#### 1.2.1 Implement Document Encryption at Rest
|
|
|
|
**Purpose**: Protect sensitive documents from unauthorized access.
|
|
|
|
**Implementation**:
|
|
|
|
```python
|
|
# documents/encryption.py
|
|
from cryptography.fernet import Fernet
|
|
from django.conf import settings
|
|
import base64
|
|
|
|
class DocumentEncryption:
|
|
"""Handle document encryption/decryption"""
|
|
|
|
def __init__(self):
|
|
# Key should be stored in secure key management system
|
|
self.cipher = Fernet(settings.DOCUMENT_ENCRYPTION_KEY)
|
|
|
|
def encrypt_file(self, file_path: str) -> str:
|
|
"""Encrypt a document file"""
|
|
with open(file_path, 'rb') as f:
|
|
plaintext = f.read()
|
|
|
|
ciphertext = self.cipher.encrypt(plaintext)
|
|
|
|
encrypted_path = f"{file_path}.encrypted"
|
|
with open(encrypted_path, 'wb') as f:
|
|
f.write(ciphertext)
|
|
|
|
return encrypted_path
|
|
|
|
def decrypt_file(self, encrypted_path: str, output_path: str = None):
|
|
"""Decrypt a document file"""
|
|
with open(encrypted_path, 'rb') as f:
|
|
ciphertext = f.read()
|
|
|
|
plaintext = self.cipher.decrypt(ciphertext)
|
|
|
|
if output_path:
|
|
with open(output_path, 'wb') as f:
|
|
f.write(plaintext)
|
|
return output_path
|
|
|
|
return plaintext
|
|
|
|
def decrypt_stream(self, encrypted_path: str):
|
|
"""Decrypt file as a stream for serving"""
|
|
import io
|
|
plaintext = self.decrypt_file(encrypted_path)
|
|
return io.BytesIO(plaintext)
|
|
|
|
# Integrate into consumer
|
|
class Consumer:
|
|
def _write(self, document, path, ...):
|
|
# ... existing code ...
|
|
|
|
if settings.ENABLE_DOCUMENT_ENCRYPTION:
|
|
encryption = DocumentEncryption()
|
|
# Encrypt original file
|
|
encrypted_path = encryption.encrypt_file(source_path)
|
|
os.rename(encrypted_path, source_path)
|
|
|
|
# Encrypt archive file
|
|
if archive_path:
|
|
encrypted_archive = encryption.encrypt_file(archive_path)
|
|
os.rename(encrypted_archive, archive_path)
|
|
```
|
|
|
|
**Configuration**:
|
|
|
|
```python
|
|
# settings.py
|
|
ENABLE_DOCUMENT_ENCRYPTION = get_env_bool('PAPERLESS_ENABLE_ENCRYPTION', False)
|
|
DOCUMENT_ENCRYPTION_KEY = os.environ.get('PAPERLESS_ENCRYPTION_KEY')
|
|
|
|
# Key rotation support
|
|
DOCUMENT_ENCRYPTION_KEY_VERSION = get_env_int('PAPERLESS_ENCRYPTION_KEY_VERSION', 1)
|
|
```
|
|
|
|
**Key Management**:
|
|
|
|
```bash
|
|
# Generate encryption key
|
|
python manage.py generate_encryption_key
|
|
|
|
# Rotate keys (re-encrypt all documents)
|
|
python manage.py rotate_encryption_key --old-key-version 1 --new-key-version 2
|
|
```
|
|
|
|
**Expected Results**:
|
|
- Documents protected at rest
|
|
- Compliance with GDPR, HIPAA requirements
|
|
- Minimal performance impact (<5% overhead)
|
|
|
|
**Implementation Time**: 2 weeks
|
|
|
|
---
|
|
|
|
#### 1.2.2 API Rate Limiting
|
|
|
|
**Implementation**:
|
|
|
|
```python
|
|
# paperless/middleware.py
|
|
from django.core.cache import cache
|
|
from django.http import HttpResponse
|
|
import time
|
|
|
|
class RateLimitMiddleware:
|
|
"""Rate limit API requests per user/IP"""
|
|
|
|
def __init__(self, get_response):
|
|
self.get_response = get_response
|
|
|
|
def __call__(self, request):
|
|
if request.path.startswith('/api/'):
|
|
# Get identifier (user ID or IP)
|
|
if request.user.is_authenticated:
|
|
identifier = f'user_{request.user.id}'
|
|
else:
|
|
identifier = f'ip_{self.get_client_ip(request)}'
|
|
|
|
# Check rate limit
|
|
if not self.check_rate_limit(identifier, request.path):
|
|
return HttpResponse(
|
|
'Rate limit exceeded. Please try again later.',
|
|
status=429
|
|
)
|
|
|
|
return self.get_response(request)
|
|
|
|
def check_rate_limit(self, identifier: str, path: str) -> bool:
|
|
"""
|
|
Rate limits:
|
|
- /api/documents/: 100 requests per minute
|
|
- /api/search/: 30 requests per minute
|
|
- /api/upload/: 10 requests per minute
|
|
"""
|
|
rate_limits = {
|
|
'/api/documents/': (100, 60),
|
|
'/api/search/': (30, 60),
|
|
'/api/upload/': (10, 60),
|
|
'default': (200, 60)
|
|
}
|
|
|
|
# Find matching rate limit
|
|
limit, window = rate_limits.get('default')
|
|
for pattern, (l, w) in rate_limits.items():
|
|
if path.startswith(pattern):
|
|
limit, window = l, w
|
|
break
|
|
|
|
# Check cache
|
|
cache_key = f'rate_limit_{identifier}_{path}'
|
|
current = cache.get(cache_key, 0)
|
|
|
|
if current >= limit:
|
|
return False
|
|
|
|
# Increment counter
|
|
cache.set(cache_key, current + 1, window)
|
|
return True
|
|
|
|
def get_client_ip(self, request):
|
|
x_forwarded_for = request.META.get('HTTP_X_FORWARDED_FOR')
|
|
if x_forwarded_for:
|
|
ip = x_forwarded_for.split(',')[0]
|
|
else:
|
|
ip = request.META.get('REMOTE_ADDR')
|
|
return ip
|
|
```
|
|
|
|
**Expected Results**:
|
|
- Protection against DoS attacks
|
|
- Fair resource allocation
|
|
- Better system stability
|
|
|
|
**Implementation Time**: 3 days
|
|
|
|
---
|
|
|
|
#### 1.2.3 Security Headers & CSP
|
|
|
|
```python
|
|
# paperless/middleware.py
|
|
class SecurityHeadersMiddleware:
|
|
"""Add security headers to responses"""
|
|
|
|
def __init__(self, get_response):
|
|
self.get_response = get_response
|
|
|
|
def __call__(self, request):
|
|
response = self.get_response(request)
|
|
|
|
# Strict Transport Security
|
|
response['Strict-Transport-Security'] = 'max-age=31536000; includeSubDomains'
|
|
|
|
# Content Security Policy
|
|
response['Content-Security-Policy'] = (
|
|
"default-src 'self'; "
|
|
"script-src 'self' 'unsafe-inline' 'unsafe-eval'; "
|
|
"style-src 'self' 'unsafe-inline'; "
|
|
"img-src 'self' data: blob:; "
|
|
"font-src 'self' data:; "
|
|
"connect-src 'self' ws: wss:; "
|
|
"frame-ancestors 'none';"
|
|
)
|
|
|
|
# X-Frame-Options (prevent clickjacking)
|
|
response['X-Frame-Options'] = 'DENY'
|
|
|
|
# X-Content-Type-Options
|
|
response['X-Content-Type-Options'] = 'nosniff'
|
|
|
|
# X-XSS-Protection
|
|
response['X-XSS-Protection'] = '1; mode=block'
|
|
|
|
# Referrer Policy
|
|
response['Referrer-Policy'] = 'strict-origin-when-cross-origin'
|
|
|
|
# Permissions Policy
|
|
response['Permissions-Policy'] = (
|
|
'geolocation=(), microphone=(), camera=()'
|
|
)
|
|
|
|
return response
|
|
```
|
|
|
|
**Implementation Time**: 2 days
|
|
|
|
---
|
|
|
|
### 1.3 AI & Machine Learning Enhancements
|
|
|
|
#### 1.3.1 Implement Advanced NLP with Transformers
|
|
|
|
**Current**: LinearSVC with TF-IDF (basic)
|
|
**Proposed**: BERT-based classification (state-of-the-art)
|
|
|
|
**Implementation**:
|
|
|
|
```python
|
|
# documents/ml/transformer_classifier.py
|
|
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
|
from transformers import TrainingArguments, Trainer
|
|
import torch
|
|
from torch.utils.data import Dataset
|
|
|
|
class DocumentDataset(Dataset):
|
|
"""Dataset for document classification"""
|
|
|
|
def __init__(self, documents, labels, tokenizer, max_length=512):
|
|
self.documents = documents
|
|
self.labels = labels
|
|
self.tokenizer = tokenizer
|
|
self.max_length = max_length
|
|
|
|
def __len__(self):
|
|
return len(self.documents)
|
|
|
|
def __getitem__(self, idx):
|
|
doc = self.documents[idx]
|
|
label = self.labels[idx]
|
|
|
|
encoding = self.tokenizer(
|
|
doc.content,
|
|
truncation=True,
|
|
padding='max_length',
|
|
max_length=self.max_length,
|
|
return_tensors='pt'
|
|
)
|
|
|
|
return {
|
|
'input_ids': encoding['input_ids'].flatten(),
|
|
'attention_mask': encoding['attention_mask'].flatten(),
|
|
'labels': torch.tensor(label, dtype=torch.long)
|
|
}
|
|
|
|
class TransformerDocumentClassifier:
|
|
"""BERT-based document classifier"""
|
|
|
|
def __init__(self, model_name='distilbert-base-uncased'):
|
|
self.model_name = model_name
|
|
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
|
|
self.model = None
|
|
|
|
def train(self, documents, labels):
|
|
"""Train the classifier"""
|
|
# Prepare dataset
|
|
dataset = DocumentDataset(documents, labels, self.tokenizer)
|
|
|
|
# Split train/validation
|
|
train_size = int(0.9 * len(dataset))
|
|
val_size = len(dataset) - train_size
|
|
train_dataset, val_dataset = torch.utils.data.random_split(
|
|
dataset, [train_size, val_size]
|
|
)
|
|
|
|
# Load model
|
|
num_labels = len(set(labels))
|
|
self.model = AutoModelForSequenceClassification.from_pretrained(
|
|
self.model_name,
|
|
num_labels=num_labels
|
|
)
|
|
|
|
# Training arguments
|
|
training_args = TrainingArguments(
|
|
output_dir='./models/document_classifier',
|
|
num_train_epochs=3,
|
|
per_device_train_batch_size=8,
|
|
per_device_eval_batch_size=8,
|
|
warmup_steps=500,
|
|
weight_decay=0.01,
|
|
logging_dir='./logs',
|
|
logging_steps=10,
|
|
evaluation_strategy='epoch',
|
|
save_strategy='epoch',
|
|
load_best_model_at_end=True,
|
|
)
|
|
|
|
# Train
|
|
trainer = Trainer(
|
|
model=self.model,
|
|
args=training_args,
|
|
train_dataset=train_dataset,
|
|
eval_dataset=val_dataset,
|
|
)
|
|
|
|
trainer.train()
|
|
|
|
# Save model
|
|
self.model.save_pretrained('./models/document_classifier_final')
|
|
self.tokenizer.save_pretrained('./models/document_classifier_final')
|
|
|
|
def predict(self, document_text):
|
|
"""Classify a document"""
|
|
if self.model is None:
|
|
self.model = AutoModelForSequenceClassification.from_pretrained(
|
|
'./models/document_classifier_final'
|
|
)
|
|
|
|
# Tokenize
|
|
inputs = self.tokenizer(
|
|
document_text,
|
|
truncation=True,
|
|
padding=True,
|
|
max_length=512,
|
|
return_tensors='pt'
|
|
)
|
|
|
|
# Predict
|
|
with torch.no_grad():
|
|
outputs = self.model(**inputs)
|
|
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
|
|
predicted_class = torch.argmax(predictions, dim=-1).item()
|
|
confidence = predictions[0][predicted_class].item()
|
|
|
|
return predicted_class, confidence
|
|
```
|
|
|
|
**Named Entity Recognition**:
|
|
|
|
```python
|
|
# documents/ml/ner.py
|
|
from transformers import pipeline
|
|
|
|
class DocumentNER:
|
|
"""Extract entities from documents"""
|
|
|
|
def __init__(self):
|
|
self.ner_pipeline = pipeline(
|
|
"ner",
|
|
model="dslim/bert-base-NER",
|
|
aggregation_strategy="simple"
|
|
)
|
|
|
|
def extract_entities(self, text):
|
|
"""Extract named entities"""
|
|
entities = self.ner_pipeline(text)
|
|
|
|
# Organize by type
|
|
organized = {
|
|
'persons': [],
|
|
'organizations': [],
|
|
'locations': [],
|
|
'dates': [],
|
|
'amounts': []
|
|
}
|
|
|
|
for entity in entities:
|
|
entity_type = entity['entity_group']
|
|
if entity_type == 'PER':
|
|
organized['persons'].append(entity['word'])
|
|
elif entity_type == 'ORG':
|
|
organized['organizations'].append(entity['word'])
|
|
elif entity_type == 'LOC':
|
|
organized['locations'].append(entity['word'])
|
|
# Add more entity types...
|
|
|
|
return organized
|
|
|
|
def extract_invoice_data(self, text):
|
|
"""Extract invoice-specific data"""
|
|
# Use regex + NER for better results
|
|
import re
|
|
|
|
data = {}
|
|
|
|
# Extract amounts
|
|
amount_pattern = r'\$?\d+[,\d]*\.?\d{0,2}'
|
|
amounts = re.findall(amount_pattern, text)
|
|
data['amounts'] = amounts
|
|
|
|
# Extract dates
|
|
date_pattern = r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}'
|
|
dates = re.findall(date_pattern, text)
|
|
data['dates'] = dates
|
|
|
|
# Extract invoice numbers
|
|
invoice_pattern = r'(?:Invoice|Inv\.?)\s*#?\s*(\d+)'
|
|
invoice_nums = re.findall(invoice_pattern, text, re.IGNORECASE)
|
|
data['invoice_numbers'] = invoice_nums
|
|
|
|
# Use NER for organization names
|
|
entities = self.extract_entities(text)
|
|
data['organizations'] = entities['organizations']
|
|
|
|
return data
|
|
```
|
|
|
|
**Semantic Search**:
|
|
|
|
```python
|
|
# documents/ml/semantic_search.py
|
|
from sentence_transformers import SentenceTransformer, util
|
|
import numpy as np
|
|
|
|
class SemanticSearch:
|
|
"""Semantic search using embeddings"""
|
|
|
|
def __init__(self):
|
|
self.model = SentenceTransformer('all-MiniLM-L6-v2')
|
|
self.document_embeddings = {}
|
|
|
|
def index_document(self, document_id, text):
|
|
"""Create embedding for document"""
|
|
embedding = self.model.encode(text, convert_to_tensor=True)
|
|
self.document_embeddings[document_id] = embedding
|
|
|
|
def search(self, query, top_k=10):
|
|
"""Search documents by semantic similarity"""
|
|
query_embedding = self.model.encode(query, convert_to_tensor=True)
|
|
|
|
# Calculate similarities
|
|
similarities = []
|
|
for doc_id, doc_embedding in self.document_embeddings.items():
|
|
similarity = util.cos_sim(query_embedding, doc_embedding).item()
|
|
similarities.append((doc_id, similarity))
|
|
|
|
# Sort by similarity
|
|
similarities.sort(key=lambda x: x[1], reverse=True)
|
|
|
|
return similarities[:top_k]
|
|
```
|
|
|
|
**Expected Results**:
|
|
- 40-60% improvement in classification accuracy
|
|
- Automatic metadata extraction (dates, amounts, parties)
|
|
- Better search results (semantic understanding)
|
|
- Support for more complex documents
|
|
|
|
**Resource Requirements**:
|
|
- GPU recommended (can use CPU with slower inference)
|
|
- 4-8GB additional RAM for models
|
|
- ~2GB disk space for models
|
|
|
|
**Implementation Time**: 4-6 weeks
|
|
|
|
---
|
|
|
|
### 1.4 Advanced OCR Improvements
|
|
|
|
#### 1.4.1 Table Detection and Extraction
|
|
|
|
**Implementation**:
|
|
|
|
```python
|
|
# paperless_tesseract/table_extraction.py
|
|
import cv2
|
|
import pytesseract
|
|
import pandas as pd
|
|
from pdf2image import convert_from_path
|
|
|
|
class TableExtractor:
|
|
"""Extract tables from documents"""
|
|
|
|
def detect_tables(self, image_path):
|
|
"""Detect table regions in image"""
|
|
img = cv2.imread(image_path, 0)
|
|
|
|
# Thresholding
|
|
thresh = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
|
|
|
|
# Detect horizontal lines
|
|
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
|
|
detect_horizontal = cv2.morphologyEx(
|
|
thresh, cv2.MORPH_OPEN, horizontal_kernel, iterations=2
|
|
)
|
|
|
|
# Detect vertical lines
|
|
vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
|
|
detect_vertical = cv2.morphologyEx(
|
|
thresh, cv2.MORPH_OPEN, vertical_kernel, iterations=2
|
|
)
|
|
|
|
# Combine
|
|
table_mask = cv2.add(detect_horizontal, detect_vertical)
|
|
|
|
# Find contours (table regions)
|
|
contours, _ = cv2.findContours(
|
|
table_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
|
|
)
|
|
|
|
tables = []
|
|
for contour in contours:
|
|
x, y, w, h = cv2.boundingRect(contour)
|
|
if w > 100 and h > 100: # Minimum table size
|
|
tables.append((x, y, w, h))
|
|
|
|
return tables
|
|
|
|
def extract_table_data(self, image_path, table_bbox):
|
|
"""Extract data from table region"""
|
|
x, y, w, h = table_bbox
|
|
|
|
# Crop table region
|
|
img = cv2.imread(image_path)
|
|
table_img = img[y:y+h, x:x+w]
|
|
|
|
# OCR with table structure
|
|
data = pytesseract.image_to_data(
|
|
table_img,
|
|
output_type=pytesseract.Output.DICT,
|
|
config='--psm 6' # Assume uniform block of text
|
|
)
|
|
|
|
# Organize into rows and columns
|
|
rows = {}
|
|
for i, text in enumerate(data['text']):
|
|
if text.strip():
|
|
row_num = data['top'][i] // 20 # Group by Y coordinate
|
|
if row_num not in rows:
|
|
rows[row_num] = []
|
|
rows[row_num].append({
|
|
'text': text,
|
|
'left': data['left'][i],
|
|
'confidence': data['conf'][i]
|
|
})
|
|
|
|
# Sort columns by X coordinate
|
|
table_data = []
|
|
for row_num in sorted(rows.keys()):
|
|
row = rows[row_num]
|
|
row.sort(key=lambda x: x['left'])
|
|
table_data.append([cell['text'] for cell in row])
|
|
|
|
return pd.DataFrame(table_data)
|
|
|
|
def extract_all_tables(self, pdf_path):
|
|
"""Extract all tables from PDF"""
|
|
# Convert PDF to images
|
|
images = convert_from_path(pdf_path)
|
|
|
|
all_tables = []
|
|
for page_num, image in enumerate(images):
|
|
# Save temp image
|
|
temp_path = f'/tmp/page_{page_num}.png'
|
|
image.save(temp_path)
|
|
|
|
# Detect tables
|
|
tables = self.detect_tables(temp_path)
|
|
|
|
# Extract each table
|
|
for table_bbox in tables:
|
|
df = self.extract_table_data(temp_path, table_bbox)
|
|
all_tables.append({
|
|
'page': page_num + 1,
|
|
'data': df
|
|
})
|
|
|
|
return all_tables
|
|
```
|
|
|
|
**Expected Results**:
|
|
- Extract structured data from invoices, reports
|
|
- 80-90% accuracy on well-formatted tables
|
|
- Export to CSV/Excel
|
|
- Searchable table contents
|
|
|
|
**Implementation Time**: 2-3 weeks
|
|
|
|
---
|
|
|
|
#### 1.4.2 Handwriting Recognition
|
|
|
|
```python
|
|
# paperless_tesseract/handwriting.py
|
|
from google.cloud import vision
|
|
import os
|
|
|
|
class HandwritingRecognizer:
|
|
"""OCR for handwritten documents"""
|
|
|
|
def __init__(self):
|
|
# Use Google Cloud Vision API (best for handwriting)
|
|
self.client = vision.ImageAnnotatorClient()
|
|
|
|
def recognize_handwriting(self, image_path):
|
|
"""Extract handwritten text"""
|
|
with open(image_path, 'rb') as image_file:
|
|
content = image_file.read()
|
|
|
|
image = vision.Image(content=content)
|
|
|
|
# Use DOCUMENT_TEXT_DETECTION for handwriting
|
|
response = self.client.document_text_detection(image=image)
|
|
|
|
if response.error.message:
|
|
raise Exception(f'Error: {response.error.message}')
|
|
|
|
# Extract text
|
|
full_text = response.full_text_annotation.text
|
|
|
|
# Extract with confidence scores
|
|
pages = []
|
|
for page in response.full_text_annotation.pages:
|
|
page_text = []
|
|
for block in page.blocks:
|
|
for paragraph in block.paragraphs:
|
|
paragraph_text = []
|
|
for word in paragraph.words:
|
|
word_text = ''.join([
|
|
symbol.text for symbol in word.symbols
|
|
])
|
|
confidence = word.confidence
|
|
paragraph_text.append({
|
|
'text': word_text,
|
|
'confidence': confidence
|
|
})
|
|
page_text.append(paragraph_text)
|
|
pages.append(page_text)
|
|
|
|
return {
|
|
'text': full_text,
|
|
'structured': pages
|
|
}
|
|
```
|
|
|
|
**Alternative**: Use Azure Computer Vision or AWS Textract for handwriting
|
|
|
|
**Expected Results**:
|
|
- Support for handwritten notes, forms
|
|
- 70-85% accuracy (depending on handwriting quality)
|
|
- Mixed printed/handwritten text support
|
|
|
|
**Implementation Time**: 2 weeks
|
|
|
|
---
|
|
|
|
## Part 2: Medium Priority Improvements
|
|
|
|
### 2.1 Mobile Experience
|
|
|
|
#### 2.1.1 Native Mobile Apps (React Native)
|
|
|
|
**Why React Native**:
|
|
- Code sharing between iOS and Android
|
|
- Near-native performance
|
|
- Large ecosystem
|
|
- TypeScript support
|
|
|
|
**Core Features**:
|
|
```typescript
|
|
// MobileApp/src/screens/DocumentScanner.tsx
|
|
import { Camera } from 'react-native-vision-camera';
|
|
import DocumentScanner from 'react-native-document-scanner-plugin';
|
|
|
|
export const DocumentScannerScreen = () => {
|
|
const scanDocument = async () => {
|
|
const { scannedImages } = await DocumentScanner.scanDocument({
|
|
maxNumDocuments: 1,
|
|
letUserAdjustCrop: true,
|
|
croppedImageQuality: 100,
|
|
});
|
|
|
|
if (scannedImages && scannedImages.length > 0) {
|
|
// Upload to IntelliDocs
|
|
await uploadDocument(scannedImages[0]);
|
|
}
|
|
};
|
|
|
|
return (
|
|
<View>
|
|
<Button onPress={scanDocument} title="Scan Document" />
|
|
</View>
|
|
);
|
|
};
|
|
|
|
// Offline support
|
|
import AsyncStorage from '@react-native-async-storage/async-storage';
|
|
import NetInfo from '@react-native-community/netinfo';
|
|
|
|
export const DocumentService = {
|
|
uploadDocument: async (file: File) => {
|
|
const isConnected = await NetInfo.fetch().then(
|
|
state => state.isConnected
|
|
);
|
|
|
|
if (!isConnected) {
|
|
// Queue for later
|
|
const queue = await AsyncStorage.getItem('upload_queue') || '[]';
|
|
const queueData = JSON.parse(queue);
|
|
queueData.push({ file, timestamp: Date.now() });
|
|
await AsyncStorage.setItem('upload_queue', JSON.stringify(queueData));
|
|
return { queued: true };
|
|
}
|
|
|
|
// Upload immediately
|
|
return await api.uploadDocument(file);
|
|
}
|
|
};
|
|
```
|
|
|
|
**Implementation Time**: 6-8 weeks
|
|
|
|
---
|
|
|
|
### 2.2 Collaboration Features
|
|
|
|
#### 2.2.1 Document Comments and Annotations
|
|
|
|
```python
|
|
# documents/models.py
|
|
class DocumentComment(models.Model):
|
|
"""Comments on documents"""
|
|
document = models.ForeignKey(Document, related_name='comments')
|
|
user = models.ForeignKey(User)
|
|
text = models.TextField()
|
|
created = models.DateTimeField(auto_now_add=True)
|
|
modified = models.DateTimeField(auto_now=True)
|
|
parent = models.ForeignKey('self', null=True, blank=True) # For replies
|
|
resolved = models.BooleanField(default=False)
|
|
|
|
# For annotations (comments on specific locations)
|
|
page_number = models.IntegerField(null=True)
|
|
position_x = models.FloatField(null=True)
|
|
position_y = models.FloatField(null=True)
|
|
|
|
class DocumentAnnotation(models.Model):
|
|
"""Visual annotations on documents"""
|
|
document = models.ForeignKey(Document, related_name='annotations')
|
|
user = models.ForeignKey(User)
|
|
page_number = models.IntegerField()
|
|
annotation_type = models.CharField(max_length=20) # highlight, rectangle, arrow, text
|
|
data = models.JSONField() # Coordinates, colors, text
|
|
created = models.DateTimeField(auto_now_add=True)
|
|
|
|
# API endpoints
|
|
class DocumentCommentViewSet(viewsets.ModelViewSet):
|
|
def create(self, request, document_pk=None):
|
|
"""Add comment to document"""
|
|
comment = DocumentComment.objects.create(
|
|
document_id=document_pk,
|
|
user=request.user,
|
|
text=request.data['text'],
|
|
page_number=request.data.get('page_number'),
|
|
position_x=request.data.get('position_x'),
|
|
position_y=request.data.get('position_y'),
|
|
)
|
|
|
|
# Notify other users
|
|
notify_document_comment(comment)
|
|
|
|
return Response(CommentSerializer(comment).data)
|
|
```
|
|
|
|
**Frontend**:
|
|
```typescript
|
|
// annotation.component.ts
|
|
export class AnnotationComponent {
|
|
annotations: Annotation[] = [];
|
|
|
|
addHighlight(selection: Selection) {
|
|
const range = selection.getRangeAt(0);
|
|
const rect = range.getBoundingClientRect();
|
|
|
|
const annotation: Annotation = {
|
|
type: 'highlight',
|
|
pageNumber: this.currentPage,
|
|
x: rect.left,
|
|
y: rect.top,
|
|
width: rect.width,
|
|
height: rect.height,
|
|
color: '#FFFF00',
|
|
text: selection.toString()
|
|
};
|
|
|
|
this.documentService.addAnnotation(
|
|
this.documentId,
|
|
annotation
|
|
).subscribe();
|
|
}
|
|
|
|
renderAnnotations() {
|
|
// Overlay annotations on PDF viewer
|
|
this.annotations.forEach(annotation => {
|
|
const element = this.createAnnotationElement(annotation);
|
|
this.pdfContainer.appendChild(element);
|
|
});
|
|
}
|
|
}
|
|
```
|
|
|
|
**Implementation Time**: 3-4 weeks
|
|
|
|
---
|
|
|
|
### 2.3 Integration Expansion
|
|
|
|
#### 2.3.1 Cloud Storage Sync
|
|
|
|
```python
|
|
# documents/integrations/cloud_storage.py
|
|
from dropbox import Dropbox
|
|
from google.oauth2 import service_account
|
|
from googleapiclient.discovery import build
|
|
|
|
class CloudStorageSync:
|
|
"""Sync documents with cloud storage"""
|
|
|
|
def sync_with_dropbox(self, access_token):
|
|
"""Two-way sync with Dropbox"""
|
|
dbx = Dropbox(access_token)
|
|
|
|
# Get files from Dropbox
|
|
result = dbx.files_list_folder('/IntelliDocs')
|
|
|
|
for entry in result.entries:
|
|
if entry.name.endswith('.pdf'):
|
|
# Check if already imported
|
|
if not Document.objects.filter(
|
|
original_filename=entry.name
|
|
).exists():
|
|
# Download and import
|
|
_, response = dbx.files_download(entry.path_display)
|
|
self.import_file(response.content, entry.name)
|
|
|
|
# Upload new documents to Dropbox
|
|
new_docs = Document.objects.filter(
|
|
synced_to_dropbox=False
|
|
)
|
|
for doc in new_docs:
|
|
with open(doc.source_path, 'rb') as f:
|
|
dbx.files_upload(
|
|
f.read(),
|
|
f'/IntelliDocs/{doc.get_public_filename()}'
|
|
)
|
|
doc.synced_to_dropbox = True
|
|
doc.save()
|
|
|
|
def sync_with_google_drive(self, credentials_path):
|
|
"""Sync with Google Drive"""
|
|
credentials = service_account.Credentials.from_service_account_file(
|
|
credentials_path
|
|
)
|
|
service = build('drive', 'v3', credentials=credentials)
|
|
|
|
# List files in Drive folder
|
|
results = service.files().list(
|
|
q="'folder_id' in parents",
|
|
fields="files(id, name, mimeType)"
|
|
).execute()
|
|
|
|
for item in results.get('files', []):
|
|
# Download and import
|
|
request = service.files().get_media(fileId=item['id'])
|
|
# ... import logic
|
|
```
|
|
|
|
**Implementation Time**: 2-3 weeks per integration
|
|
|
|
---
|
|
|
|
### 2.4 Analytics & Reporting
|
|
|
|
```python
|
|
# documents/analytics.py
|
|
from django.db.models import Count, Avg, Sum
|
|
from django.utils import timezone
|
|
from datetime import timedelta
|
|
|
|
class DocumentAnalytics:
|
|
"""Generate analytics and reports"""
|
|
|
|
def get_dashboard_stats(self, user=None):
|
|
"""Get overview statistics"""
|
|
queryset = Document.objects.all()
|
|
if user:
|
|
queryset = queryset.filter(owner=user)
|
|
|
|
stats = {
|
|
'total_documents': queryset.count(),
|
|
'documents_this_month': queryset.filter(
|
|
created__gte=timezone.now() - timedelta(days=30)
|
|
).count(),
|
|
'total_pages': queryset.aggregate(
|
|
Sum('page_count')
|
|
)['page_count__sum'] or 0,
|
|
'storage_used': queryset.aggregate(
|
|
Sum('original_size')
|
|
)['original_size__sum'] or 0,
|
|
}
|
|
|
|
# Documents by type
|
|
stats['by_type'] = queryset.values(
|
|
'document_type__name'
|
|
).annotate(
|
|
count=Count('id')
|
|
).order_by('-count')
|
|
|
|
# Documents by correspondent
|
|
stats['by_correspondent'] = queryset.values(
|
|
'correspondent__name'
|
|
).annotate(
|
|
count=Count('id')
|
|
).order_by('-count')[:10]
|
|
|
|
# Upload trend (last 12 months)
|
|
upload_trend = []
|
|
for i in range(12):
|
|
date = timezone.now() - timedelta(days=30 * i)
|
|
count = queryset.filter(
|
|
created__year=date.year,
|
|
created__month=date.month
|
|
).count()
|
|
upload_trend.append({
|
|
'month': date.strftime('%B %Y'),
|
|
'count': count
|
|
})
|
|
stats['upload_trend'] = list(reversed(upload_trend))
|
|
|
|
return stats
|
|
|
|
def generate_report(self, report_type, start_date, end_date, filters=None):
|
|
"""Generate custom reports"""
|
|
queryset = Document.objects.filter(
|
|
created__gte=start_date,
|
|
created__lte=end_date
|
|
)
|
|
|
|
if filters:
|
|
if 'correspondent' in filters:
|
|
queryset = queryset.filter(correspondent_id=filters['correspondent'])
|
|
if 'document_type' in filters:
|
|
queryset = queryset.filter(document_type_id=filters['document_type'])
|
|
|
|
if report_type == 'summary':
|
|
return self._generate_summary_report(queryset)
|
|
elif report_type == 'detailed':
|
|
return self._generate_detailed_report(queryset)
|
|
elif report_type == 'compliance':
|
|
return self._generate_compliance_report(queryset)
|
|
|
|
def export_report(self, report_data, format='pdf'):
|
|
"""Export report to PDF/Excel"""
|
|
if format == 'pdf':
|
|
return self._export_to_pdf(report_data)
|
|
elif format == 'xlsx':
|
|
return self._export_to_excel(report_data)
|
|
elif format == 'csv':
|
|
return self._export_to_csv(report_data)
|
|
```
|
|
|
|
**Frontend Dashboard**:
|
|
```typescript
|
|
// analytics-dashboard.component.ts
|
|
export class AnalyticsDashboardComponent implements OnInit {
|
|
stats: DashboardStats;
|
|
chartOptions: any;
|
|
|
|
ngOnInit() {
|
|
this.analyticsService.getDashboardStats().subscribe(stats => {
|
|
this.stats = stats;
|
|
this.setupCharts();
|
|
});
|
|
}
|
|
|
|
setupCharts() {
|
|
// Upload trend chart
|
|
this.chartOptions = {
|
|
series: [{
|
|
name: 'Documents',
|
|
data: this.stats.upload_trend.map(d => d.count)
|
|
}],
|
|
chart: {
|
|
type: 'area',
|
|
height: 350
|
|
},
|
|
xaxis: {
|
|
categories: this.stats.upload_trend.map(d => d.month)
|
|
}
|
|
};
|
|
}
|
|
|
|
generateReport(type: string) {
|
|
this.analyticsService.generateReport(type, {
|
|
start_date: this.startDate,
|
|
end_date: this.endDate,
|
|
filters: this.filters
|
|
}).subscribe(blob => {
|
|
saveAs(blob, `report_${type}.pdf`);
|
|
});
|
|
}
|
|
}
|
|
```
|
|
|
|
**Implementation Time**: 3-4 weeks
|
|
|
|
---
|
|
|
|
## Part 3: Long-term Vision
|
|
|
|
### 3.1 Advanced Features Roadmap (6-12 months)
|
|
|
|
1. **Blockchain Integration** for document timestamping and immutability
|
|
2. **Advanced Compliance** (ISO 15489, DOD 5015.2)
|
|
3. **Records Retention Automation** with legal holds
|
|
4. **Multi-tenancy** support for SaaS deployments
|
|
5. **Advanced Workflow** with visual designer
|
|
6. **Custom Plugins** system for extensions
|
|
7. **GraphQL API** alongside REST
|
|
8. **Real-time Collaboration** (Google Docs-style)
|
|
|
|
---
|
|
|
|
## Conclusion
|
|
|
|
This roadmap provides a clear path to significantly improve IntelliDocs-ngx. Start with:
|
|
|
|
1. **Week 1-2**: Performance optimization (quick wins)
|
|
2. **Week 3-4**: Security hardening
|
|
3. **Week 5-10**: AI/ML enhancements
|
|
4. **Week 11-14**: Advanced OCR
|
|
5. **Month 4-6**: Mobile & collaboration features
|
|
|
|
Each improvement has been detailed with implementation code, expected results, and time estimates. Prioritize based on your users' needs and available resources.
|
|
|
|
---
|
|
|
|
*Generated: 2025-11-09*
|
|
*For: IntelliDocs-ngx v2.19.5*
|