Implement Phase 4 advanced OCR: table extraction, handwriting recognition, and form detection

Co-authored-by: dawnsystem <42047891+dawnsystem@users.noreply.github.com>
copilot-swe-agent[bot] 2025-11-09 17:49:14 +00:00
parent e33974f8f7
commit 02d3962877
6 changed files with 2513 additions and 0 deletions


@@ -0,0 +1,31 @@
"""
Advanced OCR module for IntelliDocs-ngx.
This module provides enhanced OCR capabilities including:
- Table detection and extraction
- Handwriting recognition
- Form field detection
- Layout analysis
Lazy imports are used to avoid loading heavy dependencies unless needed.
"""
__all__ = [
'TableExtractor',
'HandwritingRecognizer',
'FormFieldDetector',
]
def __getattr__(name):
"""Lazy import to avoid loading heavy ML models on startup."""
if name == 'TableExtractor':
from .table_extractor import TableExtractor
return TableExtractor
elif name == 'HandwritingRecognizer':
from .handwriting import HandwritingRecognizer
return HandwritingRecognizer
elif name == 'FormFieldDetector':
from .form_detector import FormFieldDetector
return FormFieldDetector
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


@@ -0,0 +1,493 @@
"""
Form field detection and recognition.
This module provides capabilities to:
1. Detect form fields (checkboxes, text fields, labels)
2. Extract field values
3. Map fields to structured data
"""
import logging
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
import numpy as np
from PIL import Image
logger = logging.getLogger(__name__)
class FormFieldDetector:
"""
Detect and extract form fields from document images.
Supports:
- Text field detection
- Checkbox detection and state recognition
- Label association
- Value extraction
Example:
>>> detector = FormFieldDetector()
>>> fields = detector.detect_form_fields("form.jpg")
>>> for field in fields:
... print(f"{field['label']}: {field['value']}")
>>> # Extract specific field types
>>> checkboxes = detector.detect_checkboxes("form.jpg")
>>> for cb in checkboxes:
... print(f"{cb['label']}: {'' if cb['checked'] else ''}")
"""
def __init__(self, use_gpu: bool = True):
"""
Initialize the form field detector.
Args:
use_gpu: Whether to use GPU acceleration if available
"""
self.use_gpu = use_gpu
self._handwriting_recognizer = None
def _get_handwriting_recognizer(self):
"""Lazy load handwriting recognizer for field value extraction."""
if self._handwriting_recognizer is None:
from .handwriting import HandwritingRecognizer
self._handwriting_recognizer = HandwritingRecognizer(use_gpu=self.use_gpu)
return self._handwriting_recognizer
def detect_checkboxes(
self,
image: Image.Image,
min_size: int = 10,
max_size: int = 50
) -> List[Dict[str, Any]]:
"""
Detect checkboxes in a form image.
Args:
image: PIL Image object
min_size: Minimum checkbox size in pixels
max_size: Maximum checkbox size in pixels
Returns:
List of detected checkboxes with state
[
{
'bbox': [x1, y1, x2, y2],
'checked': True/False,
'confidence': 0.95
},
...
]
"""
try:
import cv2
# Convert to OpenCV format
img_array = np.array(image)
if len(img_array.shape) == 3:
gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
else:
gray = img_array
# Detect edges
edges = cv2.Canny(gray, 50, 150)
# Find contours
contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
checkboxes = []
for contour in contours:
# Get bounding box
x, y, w, h = cv2.boundingRect(contour)
# Check if it looks like a checkbox (square-ish, right size)
aspect_ratio = w / h if h > 0 else 0
if (min_size <= w <= max_size and
min_size <= h <= max_size and
0.7 <= aspect_ratio <= 1.3):
# Extract checkbox region
checkbox_region = gray[y:y+h, x:x+w]
# Determine if checked (look for marks inside)
checked, confidence = self._is_checkbox_checked(checkbox_region)
checkboxes.append({
'bbox': [x, y, x+w, y+h],
'checked': checked,
'confidence': confidence
})
logger.info(f"Detected {len(checkboxes)} checkboxes")
return checkboxes
except ImportError:
logger.error("opencv-python not installed. Install with: pip install opencv-python")
return []
except Exception as e:
logger.error(f"Error detecting checkboxes: {e}")
return []
def _is_checkbox_checked(self, checkbox_image: np.ndarray) -> Tuple[bool, float]:
"""
Determine if a checkbox is checked.
Args:
checkbox_image: Grayscale image of checkbox
Returns:
Tuple of (is_checked, confidence)
"""
try:
import cv2
# Binarize
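# Otsu picks the threshold automatically; THRESH_BINARY_INV makes ink (dark) pixels non-zero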
_, binary = cv2.threshold(checkbox_image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
# Count dark pixels in the center region (where mark would be)
h, w = binary.shape
center_region = binary[int(h*0.2):int(h*0.8), int(w*0.2):int(w*0.8)]
if center_region.size == 0:
return False, 0.0
dark_pixel_ratio = np.sum(center_region > 0) / center_region.size
# If more than 15% of center is dark, consider it checked
checked = dark_pixel_ratio > 0.15
confidence = min(dark_pixel_ratio * 2, 1.0) # Scale confidence
return checked, confidence
except Exception as e:
logger.warning(f"Error checking checkbox state: {e}")
return False, 0.0
def detect_text_fields(
self,
image: Image.Image,
min_width: int = 100
) -> List[Dict[str, Any]]:
"""
Detect text input fields in a form.
Args:
image: PIL Image object
min_width: Minimum field width in pixels
Returns:
List of detected text fields
[
{
'bbox': [x1, y1, x2, y2],
'type': 'line' or 'box'
},
...
]
"""
try:
import cv2
# Convert to OpenCV format
img_array = np.array(image)
if len(img_array.shape) == 3:
gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
else:
gray = img_array
# Detect horizontal lines (underlines for text fields)
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (min_width, 1))
detect_horizontal = cv2.morphologyEx(
cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1],
cv2.MORPH_OPEN,
horizontal_kernel,
iterations=2
)
# Find contours of horizontal lines
contours, _ = cv2.findContours(
detect_horizontal,
cv2.RETR_EXTERNAL,
cv2.CHAIN_APPROX_SIMPLE
)
text_fields = []
for contour in contours:
x, y, w, h = cv2.boundingRect(contour)
# Check if it's a horizontal line (field underline)
if w >= min_width and h < 10:
# Expand upward to include text area
text_bbox = [x, max(0, y-30), x+w, y+h]
text_fields.append({
'bbox': text_bbox,
'type': 'line'
})
# Detect rectangular boxes (bordered text fields)
edges = cv2.Canny(gray, 50, 150)
contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
for contour in contours:
x, y, w, h = cv2.boundingRect(contour)
# Check if it's a rectangular box
aspect_ratio = w / h if h > 0 else 0
if w >= min_width and 20 <= h <= 100 and aspect_ratio > 2:
text_fields.append({
'bbox': [x, y, x+w, y+h],
'type': 'box'
})
logger.info(f"Detected {len(text_fields)} text fields")
return text_fields
except ImportError:
logger.error("opencv-python not installed")
return []
except Exception as e:
logger.error(f"Error detecting text fields: {e}")
return []
def detect_labels(
self,
image: Image.Image,
field_bboxes: List[List[int]]
) -> List[Dict[str, Any]]:
"""
Detect labels near form fields.
Args:
image: PIL Image object
field_bboxes: List of field bounding boxes [[x1,y1,x2,y2], ...]
Returns:
List of detected labels with associated field indices
"""
try:
import pytesseract
# Get all text with bounding boxes
ocr_data = pytesseract.image_to_data(
image,
output_type=pytesseract.Output.DICT
)
# Group text into potential labels
labels = []
for i, text in enumerate(ocr_data['text']):
if text.strip() and len(text.strip()) > 2:
x = ocr_data['left'][i]
y = ocr_data['top'][i]
w = ocr_data['width'][i]
h = ocr_data['height'][i]
label_bbox = [x, y, x+w, y+h]
# Find closest field
closest_field_idx = self._find_closest_field(label_bbox, field_bboxes)
labels.append({
'text': text.strip(),
'bbox': label_bbox,
'field_index': closest_field_idx
})
return labels
except ImportError:
logger.error("pytesseract not installed")
return []
except Exception as e:
logger.error(f"Error detecting labels: {e}")
return []
def _find_closest_field(
self,
label_bbox: List[int],
field_bboxes: List[List[int]]
) -> Optional[int]:
"""
Find the closest field to a label.
Args:
label_bbox: Label bounding box [x1, y1, x2, y2]
field_bboxes: List of field bounding boxes
Returns:
Index of closest field, or None if no fields
"""
if not field_bboxes:
return None
# Calculate center of label
label_center_x = (label_bbox[0] + label_bbox[2]) / 2
label_center_y = (label_bbox[1] + label_bbox[3]) / 2
min_distance = float('inf')
closest_idx = 0
for i, field_bbox in enumerate(field_bboxes):
# Calculate center of field
field_center_x = (field_bbox[0] + field_bbox[2]) / 2
field_center_y = (field_bbox[1] + field_bbox[3]) / 2
# Euclidean distance
distance = np.sqrt(
(label_center_x - field_center_x)**2 +
(label_center_y - field_center_y)**2
)
if distance < min_distance:
min_distance = distance
closest_idx = i
return closest_idx
def detect_form_fields(
self,
image_path: str,
extract_values: bool = True
) -> List[Dict[str, Any]]:
"""
Detect all form fields and extract their values.
Args:
image_path: Path to form image
extract_values: Whether to extract field values using OCR
Returns:
List of detected fields with labels and values
[
{
'type': 'text' or 'checkbox',
'label': 'Field Label',
'value': 'field value' or True/False,
'bbox': [x1, y1, x2, y2],
'confidence': 0.95
},
...
]
"""
try:
# Load image
image = Image.open(image_path).convert('RGB')
# Detect different field types
text_fields = self.detect_text_fields(image)
checkboxes = self.detect_checkboxes(image)
# Combine all field bboxes for label detection
all_field_bboxes = [f['bbox'] for f in text_fields] + [cb['bbox'] for cb in checkboxes]
# Detect labels
labels = self.detect_labels(image, all_field_bboxes)
# Build results
results = []
# Add text fields
for i, field in enumerate(text_fields):
# Find associated label
label_text = self._find_label_for_field(i, labels, len(text_fields))
result = {
'type': 'text',
'label': label_text,
'bbox': field['bbox'],
}
# Extract value if requested
if extract_values:
x1, y1, x2, y2 = field['bbox']
field_image = image.crop((x1, y1, x2, y2))
recognizer = self._get_handwriting_recognizer()
value = recognizer.recognize_from_image(field_image, preprocess=True)
result['value'] = value.strip()
result['confidence'] = recognizer._estimate_confidence(value)
results.append(result)
# Add checkboxes
for i, checkbox in enumerate(checkboxes):
field_idx = len(text_fields) + i
label_text = self._find_label_for_field(field_idx, labels, len(all_field_bboxes))
results.append({
'type': 'checkbox',
'label': label_text,
'value': checkbox['checked'],
'bbox': checkbox['bbox'],
'confidence': checkbox['confidence']
})
logger.info(f"Detected {len(results)} form fields from {image_path}")
return results
except Exception as e:
logger.error(f"Error detecting form fields: {e}")
return []
def _find_label_for_field(
self,
field_idx: int,
labels: List[Dict[str, Any]],
total_fields: int
) -> str:
"""
Find the label text for a specific field.
Args:
field_idx: Index of the field
labels: List of detected labels
total_fields: Total number of fields
Returns:
Label text, or a generated placeholder name (e.g. "Field_3") if no label was matched
"""
matching_labels = [
label for label in labels
if label['field_index'] == field_idx
]
if matching_labels:
# Combine multiple label parts if found
return ' '.join(label['text'] for label in matching_labels)
return f"Field_{field_idx + 1}"
def extract_form_data(
self,
image_path: str,
output_format: str = 'dict'
) -> Any:
"""
Extract all form data as structured output.
Args:
image_path: Path to form image
output_format: Output format ('dict', 'json', or 'dataframe')
Returns:
Structured form data in requested format
"""
# Detect and extract fields
fields = self.detect_form_fields(image_path, extract_values=True)
if output_format == 'dict':
# Return as dictionary
return {field['label']: field['value'] for field in fields}
elif output_format == 'json':
import json
data = {field['label']: field['value'] for field in fields}
return json.dumps(data, indent=2)
elif output_format == 'dataframe':
import pandas as pd
return pd.DataFrame(fields)
else:
raise ValueError(f"Invalid output format: {output_format}")
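A short end-to-end sketch of the detector above; the file name is a placeholder, and opencv-python, pytesseract and the TrOCR stack are assumed to be installed:

detector = FormFieldDetector(use_gpu=False)  # set True to use CUDA when available

fields = detector.detect_form_fields("scanned_form.png", extract_values=True)
for field in fields:
    print(field['type'], field['label'], field['value'], field['confidence'])

# Or collapse the form into a flat mapping / JSON string / DataFrame
data = detector.extract_form_data("scanned_form.png", output_format='json')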


@@ -0,0 +1,448 @@
"""
Handwriting recognition for documents.
This module provides handwriting OCR capabilities using:
1. TrOCR (Transformer-based OCR) for printed and handwritten text
2. Custom models fine-tuned for specific handwriting styles
3. Confidence scoring for recognition quality
"""
import logging
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
import numpy as np
from PIL import Image
logger = logging.getLogger(__name__)
class HandwritingRecognizer:
"""
Recognize handwritten text from document images.
Uses transformer-based models (TrOCR) for accurate handwriting recognition.
Supports both printed and handwritten text detection.
Example:
>>> recognizer = HandwritingRecognizer()
>>> text = recognizer.recognize_from_image("handwritten_note.jpg")
>>> print(text)
"This is handwritten text..."
>>> # With line detection
>>> lines = recognizer.recognize_lines("form.jpg")
>>> for line in lines:
... print(f"{line['text']} (confidence: {line['confidence']:.2f})")
"""
def __init__(
self,
model_name: str = "microsoft/trocr-base-handwritten",
use_gpu: bool = True,
confidence_threshold: float = 0.5,
):
"""
Initialize the handwriting recognizer.
Args:
model_name: Hugging Face model name
Options:
- "microsoft/trocr-base-handwritten" (default, good for English)
- "microsoft/trocr-large-handwritten" (more accurate, slower)
- "microsoft/trocr-base-printed" (for printed text)
use_gpu: Whether to use GPU acceleration if available
confidence_threshold: Minimum confidence for accepting recognition
"""
self.model_name = model_name
self.use_gpu = use_gpu
self.confidence_threshold = confidence_threshold
self._model = None
self._processor = None
def _load_model(self):
"""Lazy load the handwriting recognition model."""
if self._model is not None:
return
try:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import torch
logger.info(f"Loading handwriting recognition model: {self.model_name}")
self._processor = TrOCRProcessor.from_pretrained(self.model_name)
self._model = VisionEncoderDecoderModel.from_pretrained(self.model_name)
# Move to GPU if available and requested
if self.use_gpu and torch.cuda.is_available():
self._model = self._model.cuda()
logger.info("Using GPU for handwriting recognition")
else:
logger.info("Using CPU for handwriting recognition")
self._model.eval() # Set to evaluation mode
except ImportError as e:
logger.error(f"Failed to load handwriting model: {e}")
logger.error("Please install: pip install transformers torch pillow")
raise
def recognize_from_image(
self,
image: Image.Image,
preprocess: bool = True
) -> str:
"""
Recognize text from a single image.
Args:
image: PIL Image object containing handwritten text
preprocess: Whether to preprocess image (contrast, binarization)
Returns:
Recognized text string
"""
self._load_model()
try:
import torch
# Preprocess image if requested
if preprocess:
image = self._preprocess_image(image)
# Prepare image for model
pixel_values = self._processor(images=image, return_tensors="pt").pixel_values
if self.use_gpu and torch.cuda.is_available():
pixel_values = pixel_values.cuda()
# Generate text
with torch.no_grad():
generated_ids = self._model.generate(pixel_values)
# Decode to text
text = self._processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
logger.debug(f"Recognized text: {text[:100]}...")
return text
except Exception as e:
logger.error(f"Error recognizing handwriting: {e}")
return ""
def _preprocess_image(self, image: Image.Image) -> Image.Image:
"""
Preprocess image for better recognition.
Args:
image: Input PIL Image
Returns:
Preprocessed PIL Image
"""
try:
from PIL import ImageEnhance, ImageFilter
# Convert to grayscale
if image.mode != 'L':
image = image.convert('L')
# Enhance contrast
enhancer = ImageEnhance.Contrast(image)
image = enhancer.enhance(2.0)
# Denoise
image = image.filter(ImageFilter.MedianFilter(size=3))
# Convert back to RGB (required by model)
image = image.convert('RGB')
return image
except Exception as e:
logger.warning(f"Error preprocessing image: {e}")
return image
def detect_text_lines(self, image: Image.Image) -> List[Dict[str, Any]]:
"""
Detect individual text lines in an image.
Args:
image: PIL Image object
Returns:
List of detected lines with bounding boxes
[
{
'bbox': [x1, y1, x2, y2],
'image': PIL.Image
},
...
]
"""
try:
import cv2
# Convert PIL to OpenCV format
img_array = np.array(image)
if len(img_array.shape) == 3:
gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
else:
gray = img_array
# Binarize
_, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
# Find contours
contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
# Get bounding boxes for each contour
lines = []
for contour in contours:
x, y, w, h = cv2.boundingRect(contour)
# Filter out very small regions
if w > 20 and h > 10:
# Crop line from original image
line_img = image.crop((x, y, x+w, y+h))
lines.append({
'bbox': [x, y, x+w, y+h],
'image': line_img
})
# Sort lines top to bottom
lines.sort(key=lambda l: l['bbox'][1])
logger.info(f"Detected {len(lines)} text lines")
return lines
except ImportError:
logger.error("opencv-python not installed. Install with: pip install opencv-python")
return []
except Exception as e:
logger.error(f"Error detecting text lines: {e}")
return []
def recognize_lines(
self,
image_path: str,
return_confidence: bool = True
) -> List[Dict[str, Any]]:
"""
Recognize text from each line in an image.
Args:
image_path: Path to image file
return_confidence: Whether to include confidence scores
Returns:
List of recognized lines with text and metadata
[
{
'text': 'recognized text',
'bbox': [x1, y1, x2, y2],
'confidence': 0.95
},
...
]
"""
try:
# Load image
image = Image.open(image_path).convert('RGB')
# Detect lines
lines = self.detect_text_lines(image)
# Recognize each line
results = []
for i, line in enumerate(lines):
logger.debug(f"Recognizing line {i+1}/{len(lines)}")
text = self.recognize_from_image(line['image'], preprocess=True)
result = {
'text': text,
'bbox': line['bbox'],
'line_index': i
}
if return_confidence:
# Simple confidence based on text length and content
confidence = self._estimate_confidence(text)
result['confidence'] = confidence
results.append(result)
logger.info(f"Recognized {len(results)} lines from {image_path}")
return results
except Exception as e:
logger.error(f"Error recognizing lines from {image_path}: {e}")
return []
def _estimate_confidence(self, text: str) -> float:
"""
Estimate confidence of recognition result.
Args:
text: Recognized text
Returns:
Confidence score (0-1)
"""
if not text:
return 0.0
# Factors that indicate good recognition
score = 0.5 # Base score
# Longer text tends to be more reliable
if len(text) > 10:
score += 0.1
if len(text) > 20:
score += 0.1
# Text with alphanumeric characters is more reliable
if any(c.isalnum() for c in text):
score += 0.1
# Text with spaces (words) is more reliable
if ' ' in text:
score += 0.1
# Penalize if too many special characters
special_chars = sum(1 for c in text if not c.isalnum() and not c.isspace())
if special_chars / len(text) > 0.5:
score -= 0.2
return max(0.0, min(1.0, score))
def recognize_from_file(
self,
image_path: str,
mode: str = 'full'
) -> Dict[str, Any]:
"""
Recognize handwriting from an image file.
Args:
image_path: Path to image file
mode: Recognition mode
- 'full': Recognize entire image as one block
- 'lines': Detect and recognize individual lines
Returns:
Dictionary with recognized text and metadata
"""
try:
if mode == 'full':
# Recognize entire image
image = Image.open(image_path).convert('RGB')
text = self.recognize_from_image(image, preprocess=True)
return {
'text': text,
'mode': 'full',
'confidence': self._estimate_confidence(text)
}
elif mode == 'lines':
# Recognize line by line
lines = self.recognize_lines(image_path, return_confidence=True)
# Combine all lines
full_text = '\n'.join(line['text'] for line in lines)
avg_confidence = np.mean([line['confidence'] for line in lines]) if lines else 0.0
return {
'text': full_text,
'lines': lines,
'mode': 'lines',
'confidence': float(avg_confidence)
}
else:
raise ValueError(f"Invalid mode: {mode}. Use 'full' or 'lines'")
except Exception as e:
logger.error(f"Error recognizing from file {image_path}: {e}")
return {
'text': '',
'mode': mode,
'confidence': 0.0,
'error': str(e)
}
def recognize_form_fields(
self,
image_path: str,
field_regions: List[Dict[str, Any]]
) -> Dict[str, str]:
"""
Recognize text from specific form fields.
Args:
image_path: Path to form image
field_regions: List of field definitions
[
{
'name': 'field_name',
'bbox': [x1, y1, x2, y2]
},
...
]
Returns:
Dictionary mapping field names to recognized text
"""
try:
# Load image
image = Image.open(image_path).convert('RGB')
# Extract and recognize each field
results = {}
for field in field_regions:
name = field['name']
bbox = field['bbox']
# Crop field region
x1, y1, x2, y2 = bbox
field_image = image.crop((x1, y1, x2, y2))
# Recognize text
text = self.recognize_from_image(field_image, preprocess=True)
results[name] = text.strip()
logger.debug(f"Field '{name}': {text[:50]}...")
return results
except Exception as e:
logger.error(f"Error recognizing form fields: {e}")
return {}
def batch_recognize(
self,
image_paths: List[str],
mode: str = 'full'
) -> List[Dict[str, Any]]:
"""
Recognize handwriting from multiple images in batch.
Args:
image_paths: List of image file paths
mode: Recognition mode ('full' or 'lines')
Returns:
List of recognition results
"""
results = []
for i, path in enumerate(image_paths):
logger.info(f"Processing image {i+1}/{len(image_paths)}: {path}")
result = self.recognize_from_file(path, mode=mode)
result['image_path'] = path
results.append(result)
return results
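A usage sketch for the recognizer above; file names are placeholders, and the first call downloads the TrOCR weights from Hugging Face:

recognizer = HandwritingRecognizer(
    model_name="microsoft/trocr-base-handwritten",
    use_gpu=False,
)

# Whole page as one block
result = recognizer.recognize_from_file("note.jpg", mode='full')
print(result['text'], result['confidence'])

# Line by line, dropping low-confidence lines
for line in recognizer.recognize_lines("letter.jpg"):
    if line['confidence'] >= recognizer.confidence_threshold:
        print(line['line_index'], line['text'])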


@@ -0,0 +1,414 @@
"""
Table detection and extraction from documents.
This module uses various techniques to detect and extract tables from documents:
1. Image-based detection using deep learning (table-transformer)
2. PDF structure analysis
3. OCR-based table detection
"""
import logging
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
import numpy as np
from PIL import Image
logger = logging.getLogger(__name__)
class TableExtractor:
"""
Extract tables from document images and PDFs.
Supports multiple extraction methods:
- Deep learning-based table detection (table-transformer model)
- PDF structure parsing
- OCR-based table extraction
Example:
>>> extractor = TableExtractor()
>>> tables = extractor.extract_tables_from_image("invoice.png")
>>> for table in tables:
... print(table['data']) # pandas DataFrame
... print(table['bbox']) # bounding box coordinates
"""
def __init__(
self,
model_name: str = "microsoft/table-transformer-detection",
confidence_threshold: float = 0.7,
use_gpu: bool = True,
):
"""
Initialize the table extractor.
Args:
model_name: Hugging Face model name for table detection
confidence_threshold: Minimum confidence score for detection (0-1)
use_gpu: Whether to use GPU acceleration if available
"""
self.model_name = model_name
self.confidence_threshold = confidence_threshold
self.use_gpu = use_gpu
self._model = None
self._processor = None
def _load_model(self):
"""Lazy load the table detection model."""
if self._model is not None:
return
try:
from transformers import AutoImageProcessor, AutoModelForObjectDetection
import torch
logger.info(f"Loading table detection model: {self.model_name}")
self._processor = AutoImageProcessor.from_pretrained(self.model_name)
self._model = AutoModelForObjectDetection.from_pretrained(self.model_name)
# Move to GPU if available and requested
if self.use_gpu and torch.cuda.is_available():
self._model = self._model.cuda()
logger.info("Using GPU for table detection")
else:
logger.info("Using CPU for table detection")
except ImportError as e:
logger.error(f"Failed to load table detection model: {e}")
logger.error("Please install required packages: pip install transformers torch pillow")
raise
def detect_tables(self, image: Image.Image) -> List[Dict[str, Any]]:
"""
Detect tables in an image.
Args:
image: PIL Image object
Returns:
List of detected tables with bounding boxes and confidence scores
[
{
'bbox': [x1, y1, x2, y2], # coordinates
'score': 0.95, # confidence
'label': 'table'
},
...
]
"""
self._load_model()
try:
import torch
# Prepare image
inputs = self._processor(images=image, return_tensors="pt")
if self.use_gpu and torch.cuda.is_available():
inputs = {k: v.cuda() for k, v in inputs.items()}
# Run detection
with torch.no_grad():
outputs = self._model(**inputs)
# Post-process results
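# image.size is (width, height); post_process_object_detection expects (height, width), hence the reversal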
target_sizes = torch.tensor([image.size[::-1]])
results = self._processor.post_process_object_detection(
outputs,
threshold=self.confidence_threshold,
target_sizes=target_sizes
)[0]
# Convert to list of dicts
tables = []
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
tables.append({
'bbox': box.cpu().tolist(),
'score': score.item(),
'label': self._model.config.id2label[label.item()]
})
logger.info(f"Detected {len(tables)} tables in image")
return tables
except Exception as e:
logger.error(f"Error detecting tables: {e}")
return []
def extract_table_from_region(
self,
image: Image.Image,
bbox: List[float],
use_ocr: bool = True
) -> Optional[Dict[str, Any]]:
"""
Extract table data from a specific region of an image.
Args:
image: PIL Image object
bbox: Bounding box [x1, y1, x2, y2]
use_ocr: Whether to use OCR for text extraction
Returns:
Extracted table data as dictionary with 'data' (pandas DataFrame)
and 'raw_text' keys, or None if extraction failed
"""
try:
# Crop to table region
x1, y1, x2, y2 = [int(coord) for coord in bbox]
table_image = image.crop((x1, y1, x2, y2))
if use_ocr:
# Use OCR to extract text and structure
import pytesseract
# Get detailed OCR data
ocr_data = pytesseract.image_to_data(
table_image,
output_type=pytesseract.Output.DICT
)
# Reconstruct table structure from OCR data
table_data = self._reconstruct_table_from_ocr(ocr_data)
# Also get raw text
raw_text = pytesseract.image_to_string(table_image)
return {
'data': table_data,
'raw_text': raw_text,
'bbox': bbox,
'image_size': table_image.size
}
else:
# Fallback to basic OCR without structure
import pytesseract
raw_text = pytesseract.image_to_string(table_image)
return {
'data': None,
'raw_text': raw_text,
'bbox': bbox,
'image_size': table_image.size
}
except ImportError:
logger.error("pytesseract not installed. Install with: pip install pytesseract")
return None
except Exception as e:
logger.error(f"Error extracting table from region: {e}")
return None
def _reconstruct_table_from_ocr(self, ocr_data: Dict) -> Optional[Any]:
"""
Reconstruct table structure from OCR output.
Args:
ocr_data: OCR data from pytesseract
Returns:
pandas DataFrame or None if reconstruction failed
"""
try:
import pandas as pd
# Group text by vertical position (rows)
rows = {}
for i, text in enumerate(ocr_data['text']):
if text.strip():
top = ocr_data['top'][i]
left = ocr_data['left'][i]
# Group by approximate row (within 20 pixels)
row_key = round(top / 20) * 20
if row_key not in rows:
rows[row_key] = []
rows[row_key].append((left, text))
# Sort rows and create DataFrame
table_rows = []
for row_y in sorted(rows.keys()):
# Sort cells by horizontal position
cells = [text for _, text in sorted(rows[row_y])]
table_rows.append(cells)
if table_rows:
# Pad rows to same length
max_cols = max(len(row) for row in table_rows)
table_rows = [row + [''] * (max_cols - len(row)) for row in table_rows]
# Create DataFrame
df = pd.DataFrame(table_rows)
# Try to use first row as header if it looks like one
if len(df) > 1:
first_row_text = ' '.join(str(x) for x in df.iloc[0])
if not any(char.isdigit() for char in first_row_text):
df.columns = df.iloc[0]
df = df[1:].reset_index(drop=True)
return df
return None
except ImportError:
logger.error("pandas not installed. Install with: pip install pandas")
return None
except Exception as e:
logger.error(f"Error reconstructing table: {e}")
return None
def extract_tables_from_image(
self,
image_path: str,
output_format: str = 'dataframe'
) -> List[Dict[str, Any]]:
"""
Extract all tables from an image file.
Args:
image_path: Path to image file
output_format: 'dataframe' or 'csv' or 'json'
Returns:
List of extracted tables with data and metadata
"""
try:
# Load image
image = Image.open(image_path).convert('RGB')
# Detect tables
detections = self.detect_tables(image)
# Extract data from each table
tables = []
for i, detection in enumerate(detections):
logger.info(f"Extracting table {i+1}/{len(detections)}")
table_data = self.extract_table_from_region(
image,
detection['bbox']
)
if table_data:
table_data['detection_score'] = detection['score']
table_data['table_index'] = i
# Convert to requested format
if output_format == 'csv' and table_data['data'] is not None:
table_data['csv'] = table_data['data'].to_csv(index=False)
elif output_format == 'json' and table_data['data'] is not None:
table_data['json'] = table_data['data'].to_json(orient='records')
tables.append(table_data)
logger.info(f"Successfully extracted {len(tables)} tables from {image_path}")
return tables
except Exception as e:
logger.error(f"Error extracting tables from image {image_path}: {e}")
return []
def extract_tables_from_pdf(
self,
pdf_path: str,
page_numbers: Optional[List[int]] = None
) -> Dict[int, List[Dict[str, Any]]]:
"""
Extract tables from a PDF document.
Args:
pdf_path: Path to PDF file
page_numbers: List of page numbers to process (1-indexed), or None for all pages
Returns:
Dictionary mapping page numbers to lists of extracted tables
"""
try:
from pdf2image import convert_from_path
logger.info(f"Converting PDF to images: {pdf_path}")
# Convert PDF pages to images
if page_numbers:
first_page = min(page_numbers)
images = convert_from_path(
pdf_path,
first_page=first_page,
last_page=max(page_numbers)
)
else:
first_page = 1
images = convert_from_path(pdf_path)
# Extract tables from each page
results = {}
for i, image in enumerate(images):
# pdf2image converts the contiguous range first_page..last_page,
# so the true page number is an offset from first_page, not page_numbers[i]
page_num = first_page + i
# Skip converted pages that were not explicitly requested
if page_numbers and page_num not in page_numbers:
continue
logger.info(f"Processing page {page_num}")
# Detect and extract tables
detections = self.detect_tables(image)
tables = []
for detection in detections:
table_data = self.extract_table_from_region(
image,
detection['bbox']
)
if table_data:
table_data['detection_score'] = detection['score']
table_data['page'] = page_num
tables.append(table_data)
if tables:
results[page_num] = tables
logger.info(f"Found {len(tables)} tables on page {page_num}")
return results
except ImportError:
logger.error("pdf2image not installed. Install with: pip install pdf2image")
return {}
except Exception as e:
logger.error(f"Error extracting tables from PDF: {e}")
return {}
def save_tables_to_excel(
self,
tables: List[Dict[str, Any]],
output_path: str
) -> bool:
"""
Save extracted tables to an Excel file.
Args:
tables: List of table dictionaries with 'data' key containing DataFrame
output_path: Path to output Excel file
Returns:
True if successful, False otherwise
"""
try:
import pandas as pd
with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
for i, table in enumerate(tables):
if table.get('data') is not None:
sheet_name = f"Table_{i+1}"
if 'page' in table:
sheet_name = f"Page_{table['page']}_Table_{i+1}"
table['data'].to_excel(
writer,
sheet_name=sheet_name,
index=False
)
logger.info(f"Saved {len(tables)} tables to {output_path}")
return True
except ImportError:
logger.error("openpyxl not installed. Install with: pip install openpyxl")
return False
except Exception as e:
logger.error(f"Error saving tables to Excel: {e}")
return False
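Finally, a sketch of the PDF-to-Excel path above; paths are placeholders, and pdf2image (plus poppler), pytesseract, pandas and openpyxl are assumed to be installed:

extractor = TableExtractor(confidence_threshold=0.7, use_gpu=False)

# Pages are 1-indexed; pass None to process the whole document
tables_by_page = extractor.extract_tables_from_pdf("report.pdf", page_numbers=[1, 2])

# Flatten and export every table that produced a DataFrame
all_tables = [t for page_tables in tables_by_page.values() for t in page_tables]
extractor.save_tables_to_excel(all_tables, "report_tables.xlsx")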