Mirror of https://github.com/paperless-ngx/paperless-ngx.git, synced 2025-12-20 05:26:53 +01:00
Implement Phase 4 advanced OCR: table extraction, handwriting recognition, and form detection
Co-authored-by: dawnsystem <42047891+dawnsystem@users.noreply.github.com>
parent e33974f8f7
commit 02d3962877
6 changed files with 2513 additions and 0 deletions
31
src/documents/ocr/__init__.py
Normal file
@@ -0,0 +1,31 @@
"""
Advanced OCR module for IntelliDocs-ngx.

This module provides enhanced OCR capabilities including:
- Table detection and extraction
- Handwriting recognition
- Form field detection
- Layout analysis

Lazy imports are used to avoid loading heavy dependencies unless needed.
"""

__all__ = [
    'TableExtractor',
    'HandwritingRecognizer',
    'FormFieldDetector',
]


def __getattr__(name):
    """Lazy import to avoid loading heavy ML models on startup."""
    if name == 'TableExtractor':
        from .table_extractor import TableExtractor
        return TableExtractor
    elif name == 'HandwritingRecognizer':
        from .handwriting import HandwritingRecognizer
        return HandwritingRecognizer
    elif name == 'FormFieldDetector':
        from .form_detector import FormFieldDetector
        return FormFieldDetector
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
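Because the module defines a module-level __getattr__ (PEP 562), importing documents.ocr stays cheap and the ML-backed classes only load on first attribute access. A minimal sketch (editor's illustration, not part of the commit), assuming src/ is on sys.path so the package imports as documents.ocr:

# Editor's sketch, not part of the commit: demonstrates the lazy loading.
import documents.ocr as ocr          # cheap: no torch/transformers imported yet

TableExtractor = ocr.TableExtractor  # first access triggers the submodule import
extractor = TableExtractor(use_gpu=False)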
493
src/documents/ocr/form_detector.py
Normal file
@@ -0,0 +1,493 @@
"""
Form field detection and recognition.

This module provides capabilities to:
1. Detect form fields (checkboxes, text fields, labels)
2. Extract field values
3. Map fields to structured data
"""

import logging
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple

import numpy as np
from PIL import Image

logger = logging.getLogger(__name__)


class FormFieldDetector:
    """
    Detect and extract form fields from document images.

    Supports:
    - Text field detection
    - Checkbox detection and state recognition
    - Label association
    - Value extraction

    Example:
        >>> detector = FormFieldDetector()
        >>> fields = detector.detect_form_fields("form.jpg")
        >>> for field in fields:
        ...     print(f"{field['label']}: {field['value']}")

        >>> # Extract specific field types
        >>> image = Image.open("form.jpg").convert('RGB')
        >>> checkboxes = detector.detect_checkboxes(image)
        >>> for cb in checkboxes:
        ...     print(f"{'✓' if cb['checked'] else '☐'} at {cb['bbox']}")
    """

    def __init__(self, use_gpu: bool = True):
        """
        Initialize the form field detector.

        Args:
            use_gpu: Whether to use GPU acceleration if available
        """
        self.use_gpu = use_gpu
        self._handwriting_recognizer = None

    def _get_handwriting_recognizer(self):
        """Lazy load the handwriting recognizer for field value extraction."""
        if self._handwriting_recognizer is None:
            from .handwriting import HandwritingRecognizer
            self._handwriting_recognizer = HandwritingRecognizer(use_gpu=self.use_gpu)
        return self._handwriting_recognizer

    def detect_checkboxes(
        self,
        image: Image.Image,
        min_size: int = 10,
        max_size: int = 50,
    ) -> List[Dict[str, Any]]:
        """
        Detect checkboxes in a form image.

        Args:
            image: PIL Image object
            min_size: Minimum checkbox size in pixels
            max_size: Maximum checkbox size in pixels

        Returns:
            List of detected checkboxes with state
            [
                {
                    'bbox': [x1, y1, x2, y2],
                    'checked': True/False,
                    'confidence': 0.95
                },
                ...
            ]
        """
        try:
            import cv2

            # Convert to OpenCV format
            img_array = np.array(image)
            if len(img_array.shape) == 3:
                gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
            else:
                gray = img_array

            # Detect edges
            edges = cv2.Canny(gray, 50, 150)

            # Find contours
            contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            checkboxes = []
            for contour in contours:
                # Get bounding box
                x, y, w, h = cv2.boundingRect(contour)

                # Check if it looks like a checkbox (square-ish, right size)
                aspect_ratio = w / h if h > 0 else 0
                if (min_size <= w <= max_size and
                        min_size <= h <= max_size and
                        0.7 <= aspect_ratio <= 1.3):

                    # Extract checkbox region
                    checkbox_region = gray[y:y+h, x:x+w]

                    # Determine if checked (look for marks inside)
                    checked, confidence = self._is_checkbox_checked(checkbox_region)

                    checkboxes.append({
                        'bbox': [x, y, x+w, y+h],
                        'checked': checked,
                        'confidence': confidence,
                    })

            logger.info(f"Detected {len(checkboxes)} checkboxes")
            return checkboxes

        except ImportError:
            logger.error("opencv-python not installed. Install with: pip install opencv-python")
            return []
        except Exception as e:
            logger.error(f"Error detecting checkboxes: {e}")
            return []

    def _is_checkbox_checked(self, checkbox_image: np.ndarray) -> Tuple[bool, float]:
        """
        Determine if a checkbox is checked.

        Args:
            checkbox_image: Grayscale image of checkbox

        Returns:
            Tuple of (is_checked, confidence)
        """
        try:
            import cv2

            # Binarize
            _, binary = cv2.threshold(checkbox_image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

            # Count dark pixels in the center region (where a mark would be)
            h, w = binary.shape
            center_region = binary[int(h*0.2):int(h*0.8), int(w*0.2):int(w*0.8)]

            if center_region.size == 0:
                return False, 0.0

            dark_pixel_ratio = np.sum(center_region > 0) / center_region.size

            # If more than 15% of the center is dark, consider it checked
            checked = dark_pixel_ratio > 0.15
            confidence = min(dark_pixel_ratio * 2, 1.0)  # Scale confidence

            return checked, confidence

        except Exception as e:
            logger.warning(f"Error checking checkbox state: {e}")
            return False, 0.0
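The checked/unchecked decision above is a fixed heuristic (Otsu binarization plus a 15% dark-pixel ratio over the central region), not a learned classifier. A self-contained sketch of the same heuristic on two synthetic boxes (editor's illustration, not part of the commit; assumes opencv-python and numpy):

# Editor's sketch: the Otsu + dark-pixel-ratio test from _is_checkbox_checked,
# run on a drawn empty box and a drawn ticked box.
import numpy as np
import cv2

def is_checked(box: np.ndarray, ratio: float = 0.15) -> bool:
    _, binary = cv2.threshold(box, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    h, w = binary.shape
    center = binary[int(h * 0.2):int(h * 0.8), int(w * 0.2):int(w * 0.8)]
    return center.size > 0 and np.sum(center > 0) / center.size > ratio

empty = np.full((30, 30), 255, dtype=np.uint8)   # white box ...
cv2.rectangle(empty, (0, 0), (29, 29), 0, 2)     # ... with only an outline
ticked = empty.copy()
cv2.line(ticked, (5, 5), (24, 24), 0, 3)         # draw an X through the center
cv2.line(ticked, (24, 5), (5, 24), 0, 3)
print(is_checked(empty), is_checked(ticked))     # False True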
    def detect_text_fields(
        self,
        image: Image.Image,
        min_width: int = 100,
    ) -> List[Dict[str, Any]]:
        """
        Detect text input fields in a form.

        Args:
            image: PIL Image object
            min_width: Minimum field width in pixels

        Returns:
            List of detected text fields
            [
                {
                    'bbox': [x1, y1, x2, y2],
                    'type': 'line' or 'box'
                },
                ...
            ]
        """
        try:
            import cv2

            # Convert to OpenCV format
            img_array = np.array(image)
            if len(img_array.shape) == 3:
                gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
            else:
                gray = img_array

            # Detect horizontal lines (underlines for text fields)
            horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (min_width, 1))
            detect_horizontal = cv2.morphologyEx(
                cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1],
                cv2.MORPH_OPEN,
                horizontal_kernel,
                iterations=2,
            )

            # Find contours of horizontal lines
            contours, _ = cv2.findContours(
                detect_horizontal,
                cv2.RETR_EXTERNAL,
                cv2.CHAIN_APPROX_SIMPLE,
            )

            text_fields = []
            for contour in contours:
                x, y, w, h = cv2.boundingRect(contour)

                # Check if it's a horizontal line (field underline)
                if w >= min_width and h < 10:
                    # Expand upward to include the text area
                    text_bbox = [x, max(0, y-30), x+w, y+h]
                    text_fields.append({
                        'bbox': text_bbox,
                        'type': 'line',
                    })

            # Detect rectangular boxes (bordered text fields)
            edges = cv2.Canny(gray, 50, 150)
            contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            for contour in contours:
                x, y, w, h = cv2.boundingRect(contour)

                # Check if it's a rectangular box
                aspect_ratio = w / h if h > 0 else 0
                if w >= min_width and 20 <= h <= 100 and aspect_ratio > 2:
                    text_fields.append({
                        'bbox': [x, y, x+w, y+h],
                        'type': 'box',
                    })

            logger.info(f"Detected {len(text_fields)} text fields")
            return text_fields

        except ImportError:
            logger.error("opencv-python not installed")
            return []
        except Exception as e:
            logger.error(f"Error detecting text fields: {e}")
            return []

    def detect_labels(
        self,
        image: Image.Image,
        field_bboxes: List[List[int]],
    ) -> List[Dict[str, Any]]:
        """
        Detect labels near form fields.

        Args:
            image: PIL Image object
            field_bboxes: List of field bounding boxes [[x1, y1, x2, y2], ...]

        Returns:
            List of detected labels with associated field indices
        """
        try:
            import pytesseract

            # Get all text with bounding boxes
            ocr_data = pytesseract.image_to_data(
                image,
                output_type=pytesseract.Output.DICT,
            )

            # Group text into potential labels
            labels = []
            for i, text in enumerate(ocr_data['text']):
                if text.strip() and len(text.strip()) > 2:
                    x = ocr_data['left'][i]
                    y = ocr_data['top'][i]
                    w = ocr_data['width'][i]
                    h = ocr_data['height'][i]

                    label_bbox = [x, y, x+w, y+h]

                    # Find the closest field
                    closest_field_idx = self._find_closest_field(label_bbox, field_bboxes)

                    labels.append({
                        'text': text.strip(),
                        'bbox': label_bbox,
                        'field_index': closest_field_idx,
                    })

            return labels

        except ImportError:
            logger.error("pytesseract not installed")
            return []
        except Exception as e:
            logger.error(f"Error detecting labels: {e}")
            return []

    def _find_closest_field(
        self,
        label_bbox: List[int],
        field_bboxes: List[List[int]],
    ) -> Optional[int]:
        """
        Find the closest field to a label.

        Args:
            label_bbox: Label bounding box [x1, y1, x2, y2]
            field_bboxes: List of field bounding boxes

        Returns:
            Index of the closest field, or None if there are no fields
        """
        if not field_bboxes:
            return None

        # Calculate center of the label
        label_center_x = (label_bbox[0] + label_bbox[2]) / 2
        label_center_y = (label_bbox[1] + label_bbox[3]) / 2

        min_distance = float('inf')
        closest_idx = 0

        for i, field_bbox in enumerate(field_bboxes):
            # Calculate center of the field
            field_center_x = (field_bbox[0] + field_bbox[2]) / 2
            field_center_y = (field_bbox[1] + field_bbox[3]) / 2

            # Euclidean distance
            distance = np.sqrt(
                (label_center_x - field_center_x)**2 +
                (label_center_y - field_center_y)**2
            )

            if distance < min_distance:
                min_distance = distance
                closest_idx = i

        return closest_idx
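A short sketch of how the three methods above compose: detect candidate fields, OCR nearby words, and pair each word with its nearest field by center distance (editor's illustration, not part of the commit; the image path is a placeholder):

# Editor's sketch: pair detected text fields with their nearest OCR'd labels.
from PIL import Image

detector = FormFieldDetector(use_gpu=False)
image = Image.open("form.png").convert('RGB')   # placeholder path

fields = detector.detect_text_fields(image)     # underlines and bordered boxes
labels = detector.detect_labels(image, [f['bbox'] for f in fields])

for label in labels:
    idx = label['field_index']
    if idx is not None:
        print(f"{label['text']!r} -> field {idx} at {fields[idx]['bbox']}")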
    def detect_form_fields(
        self,
        image_path: str,
        extract_values: bool = True,
    ) -> List[Dict[str, Any]]:
        """
        Detect all form fields and extract their values.

        Args:
            image_path: Path to form image
            extract_values: Whether to extract field values using OCR

        Returns:
            List of detected fields with labels and values
            [
                {
                    'type': 'text' or 'checkbox',
                    'label': 'Field Label',
                    'value': 'field value' or True/False,
                    'bbox': [x1, y1, x2, y2],
                    'confidence': 0.95
                },
                ...
            ]
        """
        try:
            # Load image
            image = Image.open(image_path).convert('RGB')

            # Detect the different field types
            text_fields = self.detect_text_fields(image)
            checkboxes = self.detect_checkboxes(image)

            # Combine all field bboxes for label detection
            all_field_bboxes = [f['bbox'] for f in text_fields] + [cb['bbox'] for cb in checkboxes]

            # Detect labels
            labels = self.detect_labels(image, all_field_bboxes)

            # Build results
            results = []

            # Add text fields
            for i, field in enumerate(text_fields):
                # Find the associated label
                label_text = self._find_label_for_field(i, labels, len(text_fields))

                result = {
                    'type': 'text',
                    'label': label_text,
                    'bbox': field['bbox'],
                }

                # Extract value if requested
                if extract_values:
                    x1, y1, x2, y2 = field['bbox']
                    field_image = image.crop((x1, y1, x2, y2))

                    recognizer = self._get_handwriting_recognizer()
                    value = recognizer.recognize_from_image(field_image, preprocess=True)
                    result['value'] = value.strip()
                    result['confidence'] = recognizer._estimate_confidence(value)

                results.append(result)

            # Add checkboxes
            for i, checkbox in enumerate(checkboxes):
                field_idx = len(text_fields) + i
                label_text = self._find_label_for_field(field_idx, labels, len(all_field_bboxes))

                results.append({
                    'type': 'checkbox',
                    'label': label_text,
                    'value': checkbox['checked'],
                    'bbox': checkbox['bbox'],
                    'confidence': checkbox['confidence'],
                })

            logger.info(f"Detected {len(results)} form fields from {image_path}")
            return results

        except Exception as e:
            logger.error(f"Error detecting form fields: {e}")
            return []

    def _find_label_for_field(
        self,
        field_idx: int,
        labels: List[Dict[str, Any]],
        total_fields: int,
    ) -> str:
        """
        Find the label text for a specific field.

        Args:
            field_idx: Index of the field
            labels: List of detected labels
            total_fields: Total number of fields (currently unused)

        Returns:
            Label text, or a generated placeholder if none was found
        """
        matching_labels = [
            label for label in labels
            if label['field_index'] == field_idx
        ]

        if matching_labels:
            # Combine multiple label parts if found
            return ' '.join(label['text'] for label in matching_labels)

        return f"Field_{field_idx + 1}"

    def extract_form_data(
        self,
        image_path: str,
        output_format: str = 'dict',
    ) -> Any:
        """
        Extract all form data as structured output.

        Args:
            image_path: Path to form image
            output_format: Output format ('dict', 'json', or 'dataframe')

        Returns:
            Structured form data in the requested format
        """
        # Detect and extract fields
        fields = self.detect_form_fields(image_path, extract_values=True)

        if output_format == 'dict':
            # Return as dictionary
            return {field['label']: field['value'] for field in fields}

        elif output_format == 'json':
            import json
            data = {field['label']: field['value'] for field in fields}
            return json.dumps(data, indent=2)

        elif output_format == 'dataframe':
            import pandas as pd
            return pd.DataFrame(fields)

        else:
            raise ValueError(f"Invalid output format: {output_format}")
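End to end, detect_form_fields and extract_form_data are the public entry points. A hedged usage sketch (editor's illustration, not part of the commit; placeholder path, import path assumes src/ on sys.path; value extraction pulls in the TrOCR stack, and 'dataframe' output additionally needs pandas):

# Editor's sketch: full form extraction to a flat label -> value mapping.
from documents.ocr import FormFieldDetector

detector = FormFieldDetector(use_gpu=False)
data = detector.extract_form_data("scanned_form.png", output_format='dict')
for label, value in data.items():
    print(f"{label}: {value}")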
448
src/documents/ocr/handwriting.py
Normal file
@@ -0,0 +1,448 @@
"""
Handwriting recognition for documents.

This module provides handwriting OCR capabilities using:
1. TrOCR (Transformer-based OCR) for printed and handwritten text
2. Custom models fine-tuned for specific handwriting styles
3. Confidence scoring for recognition quality
"""

import logging
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple

import numpy as np
from PIL import Image

logger = logging.getLogger(__name__)


class HandwritingRecognizer:
    """
    Recognize handwritten text from document images.

    Uses transformer-based models (TrOCR) for accurate handwriting recognition.
    Supports both printed and handwritten text detection.

    Example:
        >>> recognizer = HandwritingRecognizer()
        >>> image = Image.open("handwritten_note.jpg").convert('RGB')
        >>> text = recognizer.recognize_from_image(image)
        >>> print(text)
        "This is handwritten text..."

        >>> # With line detection
        >>> lines = recognizer.recognize_lines("form.jpg")
        >>> for line in lines:
        ...     print(f"{line['text']} (confidence: {line['confidence']:.2f})")
    """

    def __init__(
        self,
        model_name: str = "microsoft/trocr-base-handwritten",
        use_gpu: bool = True,
        confidence_threshold: float = 0.5,
    ):
        """
        Initialize the handwriting recognizer.

        Args:
            model_name: Hugging Face model name
                Options:
                - "microsoft/trocr-base-handwritten" (default, good for English)
                - "microsoft/trocr-large-handwritten" (more accurate, slower)
                - "microsoft/trocr-base-printed" (for printed text)
            use_gpu: Whether to use GPU acceleration if available
            confidence_threshold: Minimum confidence for accepting recognition
        """
        self.model_name = model_name
        self.use_gpu = use_gpu
        self.confidence_threshold = confidence_threshold
        self._model = None
        self._processor = None

    def _load_model(self):
        """Lazy load the handwriting recognition model."""
        if self._model is not None:
            return

        try:
            from transformers import TrOCRProcessor, VisionEncoderDecoderModel
            import torch

            logger.info(f"Loading handwriting recognition model: {self.model_name}")

            self._processor = TrOCRProcessor.from_pretrained(self.model_name)
            self._model = VisionEncoderDecoderModel.from_pretrained(self.model_name)

            # Move to GPU if available and requested
            if self.use_gpu and torch.cuda.is_available():
                self._model = self._model.cuda()
                logger.info("Using GPU for handwriting recognition")
            else:
                logger.info("Using CPU for handwriting recognition")

            self._model.eval()  # Set to evaluation mode

        except ImportError as e:
            logger.error(f"Failed to load handwriting model: {e}")
            logger.error("Please install: pip install transformers torch pillow")
            raise

    def recognize_from_image(
        self,
        image: Image.Image,
        preprocess: bool = True,
    ) -> str:
        """
        Recognize text from a single image.

        Args:
            image: PIL Image object containing handwritten text
            preprocess: Whether to preprocess the image (contrast, denoising)

        Returns:
            Recognized text string
        """
        self._load_model()

        try:
            import torch

            # Preprocess image if requested
            if preprocess:
                image = self._preprocess_image(image)

            # Prepare image for the model
            pixel_values = self._processor(images=image, return_tensors="pt").pixel_values

            if self.use_gpu and torch.cuda.is_available():
                pixel_values = pixel_values.cuda()

            # Generate text
            with torch.no_grad():
                generated_ids = self._model.generate(pixel_values)

            # Decode to text
            text = self._processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

            logger.debug(f"Recognized text: {text[:100]}...")
            return text

        except Exception as e:
            logger.error(f"Error recognizing handwriting: {e}")
            return ""

    def _preprocess_image(self, image: Image.Image) -> Image.Image:
        """
        Preprocess an image for better recognition.

        Args:
            image: Input PIL Image

        Returns:
            Preprocessed PIL Image
        """
        try:
            from PIL import ImageEnhance, ImageFilter

            # Convert to grayscale
            if image.mode != 'L':
                image = image.convert('L')

            # Enhance contrast
            enhancer = ImageEnhance.Contrast(image)
            image = enhancer.enhance(2.0)

            # Denoise
            image = image.filter(ImageFilter.MedianFilter(size=3))

            # Convert back to RGB (required by the model)
            image = image.convert('RGB')

            return image

        except Exception as e:
            logger.warning(f"Error preprocessing image: {e}")
            return image
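For reference, the bare TrOCR round trip that _load_model and recognize_from_image wrap looks like this (editor's sketch, not part of the commit; the path is a placeholder, and TrOCR expects single text lines or short crops rather than whole pages):

# Editor's sketch: minimal TrOCR inference without the lazy-loading wrapper.
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import torch

processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten").eval()

image = Image.open("note.jpg").convert("RGB")   # placeholder path
pixel_values = processor(images=image, return_tensors="pt").pixel_values
with torch.no_grad():
    ids = model.generate(pixel_values)
print(processor.batch_decode(ids, skip_special_tokens=True)[0])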
    def detect_text_lines(self, image: Image.Image) -> List[Dict[str, Any]]:
        """
        Detect individual text lines in an image.

        Args:
            image: PIL Image object

        Returns:
            List of detected lines with bounding boxes
            [
                {
                    'bbox': [x1, y1, x2, y2],
                    'image': PIL.Image
                },
                ...
            ]
        """
        try:
            import cv2

            # Convert PIL to OpenCV format
            img_array = np.array(image)
            if len(img_array.shape) == 3:
                gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
            else:
                gray = img_array

            # Binarize
            _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

            # Find contours
            contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            # Get bounding boxes for each contour
            lines = []
            for contour in contours:
                x, y, w, h = cv2.boundingRect(contour)

                # Filter out very small regions
                if w > 20 and h > 10:
                    # Crop the line from the original image
                    line_img = image.crop((x, y, x+w, y+h))
                    lines.append({
                        'bbox': [x, y, x+w, y+h],
                        'image': line_img,
                    })

            # Sort lines top to bottom
            lines.sort(key=lambda line: line['bbox'][1])

            logger.info(f"Detected {len(lines)} text lines")
            return lines

        except ImportError:
            logger.error("opencv-python not installed. Install with: pip install opencv-python")
            return []
        except Exception as e:
            logger.error(f"Error detecting text lines: {e}")
            return []

    def recognize_lines(
        self,
        image_path: str,
        return_confidence: bool = True,
    ) -> List[Dict[str, Any]]:
        """
        Recognize text from each line in an image.

        Args:
            image_path: Path to image file
            return_confidence: Whether to include confidence scores

        Returns:
            List of recognized lines with text and metadata
            [
                {
                    'text': 'recognized text',
                    'bbox': [x1, y1, x2, y2],
                    'confidence': 0.95
                },
                ...
            ]
        """
        try:
            # Load image
            image = Image.open(image_path).convert('RGB')

            # Detect lines
            lines = self.detect_text_lines(image)

            # Recognize each line
            results = []
            for i, line in enumerate(lines):
                logger.debug(f"Recognizing line {i+1}/{len(lines)}")

                text = self.recognize_from_image(line['image'], preprocess=True)

                result = {
                    'text': text,
                    'bbox': line['bbox'],
                    'line_index': i,
                }

                if return_confidence:
                    # Simple confidence based on text length and content
                    confidence = self._estimate_confidence(text)
                    result['confidence'] = confidence

                results.append(result)

            logger.info(f"Recognized {len(results)} lines from {image_path}")
            return results

        except Exception as e:
            logger.error(f"Error recognizing lines from {image_path}: {e}")
            return []

    def _estimate_confidence(self, text: str) -> float:
        """
        Estimate the confidence of a recognition result.

        Args:
            text: Recognized text

        Returns:
            Confidence score (0-1)
        """
        if not text:
            return 0.0

        # Factors that indicate good recognition
        score = 0.5  # Base score

        # Longer text tends to be more reliable
        if len(text) > 10:
            score += 0.1
        if len(text) > 20:
            score += 0.1

        # Text with alphanumeric characters is more reliable
        if any(c.isalnum() for c in text):
            score += 0.1

        # Text with spaces (words) is more reliable
        if ' ' in text:
            score += 0.1

        # Penalize if there are too many special characters
        special_chars = sum(1 for c in text if not c.isalnum() and not c.isspace())
        if special_chars / len(text) > 0.5:
            score -= 0.2

        return max(0.0, min(1.0, score))

    def recognize_from_file(
        self,
        image_path: str,
        mode: str = 'full',
    ) -> Dict[str, Any]:
        """
        Recognize handwriting from an image file.

        Args:
            image_path: Path to image file
            mode: Recognition mode
                - 'full': Recognize the entire image as one block
                - 'lines': Detect and recognize individual lines

        Returns:
            Dictionary with recognized text and metadata
        """
        try:
            if mode == 'full':
                # Recognize the entire image
                image = Image.open(image_path).convert('RGB')
                text = self.recognize_from_image(image, preprocess=True)

                return {
                    'text': text,
                    'mode': 'full',
                    'confidence': self._estimate_confidence(text),
                }

            elif mode == 'lines':
                # Recognize line by line
                lines = self.recognize_lines(image_path, return_confidence=True)

                # Combine all lines
                full_text = '\n'.join(line['text'] for line in lines)
                avg_confidence = np.mean([line['confidence'] for line in lines]) if lines else 0.0

                return {
                    'text': full_text,
                    'lines': lines,
                    'mode': 'lines',
                    'confidence': float(avg_confidence),
                }

            else:
                raise ValueError(f"Invalid mode: {mode}. Use 'full' or 'lines'")

        except Exception as e:
            logger.error(f"Error recognizing from file {image_path}: {e}")
            return {
                'text': '',
                'mode': mode,
                'confidence': 0.0,
                'error': str(e),
            }

    def recognize_form_fields(
        self,
        image_path: str,
        field_regions: List[Dict[str, Any]],
    ) -> Dict[str, str]:
        """
        Recognize text from specific form fields.

        Args:
            image_path: Path to form image
            field_regions: List of field definitions
                [
                    {
                        'name': 'field_name',
                        'bbox': [x1, y1, x2, y2]
                    },
                    ...
                ]

        Returns:
            Dictionary mapping field names to recognized text
        """
        try:
            # Load image
            image = Image.open(image_path).convert('RGB')

            # Extract and recognize each field
            results = {}
            for field in field_regions:
                name = field['name']
                bbox = field['bbox']

                # Crop the field region
                x1, y1, x2, y2 = bbox
                field_image = image.crop((x1, y1, x2, y2))

                # Recognize text
                text = self.recognize_from_image(field_image, preprocess=True)
                results[name] = text.strip()

                logger.debug(f"Field '{name}': {text[:50]}...")

            return results

        except Exception as e:
            logger.error(f"Error recognizing form fields: {e}")
            return {}

    def batch_recognize(
        self,
        image_paths: List[str],
        mode: str = 'full',
    ) -> List[Dict[str, Any]]:
        """
        Recognize handwriting from multiple images in batch.

        Args:
            image_paths: List of image file paths
            mode: Recognition mode ('full' or 'lines')

        Returns:
            List of recognition results
        """
        results = []
        for i, path in enumerate(image_paths):
            logger.info(f"Processing image {i+1}/{len(image_paths)}: {path}")
            result = self.recognize_from_file(path, mode=mode)
            result['image_path'] = path
            results.append(result)

        return results
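Note that confidence_threshold is stored in __init__ but never applied inside the class, so callers filter on it themselves. A sketch of the line-by-line path with that filter (editor's illustration, not part of the commit; placeholder path):

# Editor's sketch: line-mode recognition plus the caller-side confidence filter.
recognizer = HandwritingRecognizer(use_gpu=False, confidence_threshold=0.6)

result = recognizer.recognize_from_file("meeting_notes.jpg", mode='lines')
for line in result.get('lines', []):
    if line['confidence'] >= recognizer.confidence_threshold:
        print(f"{line['confidence']:.2f}  {line['text']}")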
414
src/documents/ocr/table_extractor.py
Normal file
@@ -0,0 +1,414 @@
"""
Table detection and extraction from documents.

This module uses various techniques to detect and extract tables from documents:
1. Image-based detection using deep learning (table-transformer)
2. PDF structure analysis
3. OCR-based table detection
"""

import logging
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple

import numpy as np
from PIL import Image

logger = logging.getLogger(__name__)


class TableExtractor:
    """
    Extract tables from document images and PDFs.

    Supports multiple extraction methods:
    - Deep learning-based table detection (table-transformer model)
    - PDF structure parsing
    - OCR-based table extraction

    Example:
        >>> extractor = TableExtractor()
        >>> tables = extractor.extract_tables_from_image("invoice.png")
        >>> for table in tables:
        ...     print(table['data'])  # pandas DataFrame
        ...     print(table['bbox'])  # bounding box coordinates
    """

    def __init__(
        self,
        model_name: str = "microsoft/table-transformer-detection",
        confidence_threshold: float = 0.7,
        use_gpu: bool = True,
    ):
        """
        Initialize the table extractor.

        Args:
            model_name: Hugging Face model name for table detection
            confidence_threshold: Minimum confidence score for detection (0-1)
            use_gpu: Whether to use GPU acceleration if available
        """
        self.model_name = model_name
        self.confidence_threshold = confidence_threshold
        self.use_gpu = use_gpu
        self._model = None
        self._processor = None

    def _load_model(self):
        """Lazy load the table detection model."""
        if self._model is not None:
            return

        try:
            from transformers import AutoImageProcessor, AutoModelForObjectDetection
            import torch

            logger.info(f"Loading table detection model: {self.model_name}")

            self._processor = AutoImageProcessor.from_pretrained(self.model_name)
            self._model = AutoModelForObjectDetection.from_pretrained(self.model_name)

            # Move to GPU if available and requested
            if self.use_gpu and torch.cuda.is_available():
                self._model = self._model.cuda()
                logger.info("Using GPU for table detection")
            else:
                logger.info("Using CPU for table detection")

        except ImportError as e:
            logger.error(f"Failed to load table detection model: {e}")
            logger.error("Please install required packages: pip install transformers torch pillow")
            raise

    def detect_tables(self, image: Image.Image) -> List[Dict[str, Any]]:
        """
        Detect tables in an image.

        Args:
            image: PIL Image object

        Returns:
            List of detected tables with bounding boxes and confidence scores
            [
                {
                    'bbox': [x1, y1, x2, y2],  # coordinates
                    'score': 0.95,             # confidence
                    'label': 'table'
                },
                ...
            ]
        """
        self._load_model()

        try:
            import torch

            # Prepare image
            inputs = self._processor(images=image, return_tensors="pt")

            if self.use_gpu and torch.cuda.is_available():
                inputs = {k: v.cuda() for k, v in inputs.items()}

            # Run detection
            with torch.no_grad():
                outputs = self._model(**inputs)

            # Post-process results
            target_sizes = torch.tensor([image.size[::-1]])
            results = self._processor.post_process_object_detection(
                outputs,
                threshold=self.confidence_threshold,
                target_sizes=target_sizes,
            )[0]

            # Convert to a list of dicts
            tables = []
            for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
                tables.append({
                    'bbox': box.cpu().tolist(),
                    'score': score.item(),
                    'label': self._model.config.id2label[label.item()],
                })

            logger.info(f"Detected {len(tables)} tables in image")
            return tables

        except Exception as e:
            logger.error(f"Error detecting tables: {e}")
            return []
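A sketch of using the detector on its own: detect, then crop the best-scoring region for downstream processing (editor's illustration, not part of the commit; placeholder paths):

# Editor's sketch: detect tables and save the highest-scoring crop.
from PIL import Image

extractor = TableExtractor(confidence_threshold=0.7, use_gpu=False)
image = Image.open("invoice.png").convert('RGB')   # placeholder path

detections = extractor.detect_tables(image)
if detections:
    best = max(detections, key=lambda d: d['score'])
    x1, y1, x2, y2 = [int(c) for c in best['bbox']]
    image.crop((x1, y1, x2, y2)).save("table_crop.png")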
    def extract_table_from_region(
        self,
        image: Image.Image,
        bbox: List[float],
        use_ocr: bool = True,
    ) -> Optional[Dict[str, Any]]:
        """
        Extract table data from a specific region of an image.

        Args:
            image: PIL Image object
            bbox: Bounding box [x1, y1, x2, y2]
            use_ocr: Whether to use OCR for text extraction

        Returns:
            Extracted table data as a dictionary with 'data' (pandas DataFrame)
            and 'raw_text' keys, or None if extraction failed
        """
        try:
            # Crop to the table region
            x1, y1, x2, y2 = [int(coord) for coord in bbox]
            table_image = image.crop((x1, y1, x2, y2))

            if use_ocr:
                # Use OCR to extract text and structure
                import pytesseract

                # Get detailed OCR data
                ocr_data = pytesseract.image_to_data(
                    table_image,
                    output_type=pytesseract.Output.DICT,
                )

                # Reconstruct table structure from OCR data
                table_data = self._reconstruct_table_from_ocr(ocr_data)

                # Also get raw text
                raw_text = pytesseract.image_to_string(table_image)

                return {
                    'data': table_data,
                    'raw_text': raw_text,
                    'bbox': bbox,
                    'image_size': table_image.size,
                }
            else:
                # Fall back to basic OCR without structure
                import pytesseract
                raw_text = pytesseract.image_to_string(table_image)
                return {
                    'data': None,
                    'raw_text': raw_text,
                    'bbox': bbox,
                    'image_size': table_image.size,
                }

        except ImportError:
            logger.error("pytesseract not installed. Install with: pip install pytesseract")
            return None
        except Exception as e:
            logger.error(f"Error extracting table from region: {e}")
            return None

    def _reconstruct_table_from_ocr(self, ocr_data: Dict) -> Optional[Any]:
        """
        Reconstruct table structure from OCR output.

        Args:
            ocr_data: OCR data from pytesseract

        Returns:
            pandas DataFrame, or None if reconstruction failed
        """
        try:
            import pandas as pd

            # Group text by vertical position (rows)
            rows = {}
            for i, text in enumerate(ocr_data['text']):
                if text.strip():
                    top = ocr_data['top'][i]
                    left = ocr_data['left'][i]

                    # Group by approximate row (within 20 pixels)
                    row_key = round(top / 20) * 20
                    if row_key not in rows:
                        rows[row_key] = []
                    rows[row_key].append((left, text))

            # Sort rows and create the DataFrame
            table_rows = []
            for row_y in sorted(rows.keys()):
                # Sort cells by horizontal position
                cells = [text for _, text in sorted(rows[row_y])]
                table_rows.append(cells)

            if table_rows:
                # Pad rows to the same length
                max_cols = max(len(row) for row in table_rows)
                table_rows = [row + [''] * (max_cols - len(row)) for row in table_rows]

                # Create DataFrame
                df = pd.DataFrame(table_rows)

                # Use the first row as header if it looks like one
                if len(df) > 1:
                    first_row_text = ' '.join(str(x) for x in df.iloc[0])
                    if not any(char.isdigit() for char in first_row_text):
                        df.columns = df.iloc[0]
                        df = df[1:].reset_index(drop=True)

                return df

            return None

        except ImportError:
            logger.error("pandas not installed. Install with: pip install pandas")
            return None
        except Exception as e:
            logger.error(f"Error reconstructing table: {e}")
            return None
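The reconstruction above buckets words into 20 px rows (round(top / 20) * 20), sorts within a row by left offset, and promotes the first row to header only if it contains no digits. A sketch on a hand-made pytesseract-style dict (editor's illustration, not part of the commit; needs only pandas, and calls a private method purely to show the behavior):

# Editor's sketch: exercise the row-bucketing reconstruction directly.
ocr_data = {
    'text': ['Item', 'Qty', 'Paper', '3', 'Toner', '1'],
    'top':  [8,      9,     41,      40,  72,      70],
    'left': [5,      120,   5,       120, 5,       120],
}
df = TableExtractor()._reconstruct_table_from_ocr(ocr_data)
print(df)   # two data rows under the 'Item'/'Qty' header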
    def extract_tables_from_image(
        self,
        image_path: str,
        output_format: str = 'dataframe',
    ) -> List[Dict[str, Any]]:
        """
        Extract all tables from an image file.

        Args:
            image_path: Path to image file
            output_format: 'dataframe', 'csv', or 'json'

        Returns:
            List of extracted tables with data and metadata
        """
        try:
            # Load image
            image = Image.open(image_path).convert('RGB')

            # Detect tables
            detections = self.detect_tables(image)

            # Extract data from each table
            tables = []
            for i, detection in enumerate(detections):
                logger.info(f"Extracting table {i+1}/{len(detections)}")

                table_data = self.extract_table_from_region(
                    image,
                    detection['bbox'],
                )

                if table_data:
                    table_data['detection_score'] = detection['score']
                    table_data['table_index'] = i

                    # Convert to the requested format
                    if output_format == 'csv' and table_data['data'] is not None:
                        table_data['csv'] = table_data['data'].to_csv(index=False)
                    elif output_format == 'json' and table_data['data'] is not None:
                        table_data['json'] = table_data['data'].to_json(orient='records')

                    tables.append(table_data)

            logger.info(f"Successfully extracted {len(tables)} tables from {image_path}")
            return tables

        except Exception as e:
            logger.error(f"Error extracting tables from image {image_path}: {e}")
            return []

    def extract_tables_from_pdf(
        self,
        pdf_path: str,
        page_numbers: Optional[List[int]] = None,
    ) -> Dict[int, List[Dict[str, Any]]]:
        """
        Extract tables from a PDF document.

        Args:
            pdf_path: Path to PDF file
            page_numbers: List of page numbers to process (1-indexed), or None for all pages

        Returns:
            Dictionary mapping page numbers to lists of extracted tables
        """
        try:
            from pdf2image import convert_from_path

            logger.info(f"Converting PDF to images: {pdf_path}")

            # Convert PDF pages to images. pdf2image converts the contiguous
            # range first_page..last_page, so non-requested pages in between
            # are skipped in the loop below.
            if page_numbers:
                first_page = min(page_numbers)
                images = convert_from_path(
                    pdf_path,
                    first_page=first_page,
                    last_page=max(page_numbers),
                )
                wanted = set(page_numbers)
            else:
                first_page = 1
                images = convert_from_path(pdf_path)
                wanted = None

            # Extract tables from each page
            results = {}
            for i, image in enumerate(images):
                page_num = first_page + i
                if wanted is not None and page_num not in wanted:
                    continue
                logger.info(f"Processing page {page_num}")

                # Detect and extract tables
                detections = self.detect_tables(image)
                tables = []

                for detection in detections:
                    table_data = self.extract_table_from_region(
                        image,
                        detection['bbox'],
                    )
                    if table_data:
                        table_data['detection_score'] = detection['score']
                        table_data['page'] = page_num
                        tables.append(table_data)

                if tables:
                    results[page_num] = tables
                    logger.info(f"Found {len(tables)} tables on page {page_num}")

            return results

        except ImportError:
            logger.error("pdf2image not installed. Install with: pip install pdf2image")
            return {}
        except Exception as e:
            logger.error(f"Error extracting tables from PDF: {e}")
            return {}

    def save_tables_to_excel(
        self,
        tables: List[Dict[str, Any]],
        output_path: str,
    ) -> bool:
        """
        Save extracted tables to an Excel file.

        Args:
            tables: List of table dictionaries with a 'data' key containing a DataFrame
            output_path: Path to the output Excel file

        Returns:
            True if successful, False otherwise
        """
        try:
            import pandas as pd

            with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
                for i, table in enumerate(tables):
                    if table.get('data') is not None:
                        sheet_name = f"Table_{i+1}"
                        if 'page' in table:
                            sheet_name = f"Page_{table['page']}_Table_{i+1}"

                        table['data'].to_excel(
                            writer,
                            sheet_name=sheet_name,
                            index=False,
                        )

            logger.info(f"Saved {len(tables)} tables to {output_path}")
            return True

        except ImportError:
            logger.error("openpyxl not installed. Install with: pip install openpyxl")
            return False
        except Exception as e:
            logger.error(f"Error saving tables to Excel: {e}")
            return False
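And a sketch of the PDF path end to end: convert pages, detect tables on each, and write everything to one workbook (editor's illustration, not part of the commit; placeholder paths; requires pdf2image, pytesseract, pandas, and openpyxl):

# Editor's sketch: PDF pages -> detected tables -> one Excel workbook.
extractor = TableExtractor(use_gpu=False)

by_page = extractor.extract_tables_from_pdf("report.pdf", page_numbers=[1, 3])
all_tables = [t for tables in by_page.values() for t in tables]
if extractor.save_tables_to_excel(all_tables, "report_tables.xlsx"):
    print(f"Exported {len(all_tables)} tables")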