Implement Phase 4 advanced OCR: table extraction, handwriting recognition, and form detection

Co-authored-by: dawnsystem <42047891+dawnsystem@users.noreply.github.com>
copilot-swe-agent[bot] 2025-11-09 17:49:14 +00:00
parent e33974f8f7
commit 02d3962877
6 changed files with 2513 additions and 0 deletions


@@ -0,0 +1,31 @@
"""
Advanced OCR module for IntelliDocs-ngx.
This module provides enhanced OCR capabilities including:
- Table detection and extraction
- Handwriting recognition
- Form field detection
- Layout analysis
Lazy imports are used to avoid loading heavy dependencies unless needed.
"""
__all__ = [
'TableExtractor',
'HandwritingRecognizer',
'FormFieldDetector',
]
def __getattr__(name):
"""Lazy import to avoid loading heavy ML models on startup."""
if name == 'TableExtractor':
from .table_extractor import TableExtractor
return TableExtractor
elif name == 'HandwritingRecognizer':
from .handwriting import HandwritingRecognizer
return HandwritingRecognizer
elif name == 'FormFieldDetector':
from .form_detector import FormFieldDetector
return FormFieldDetector
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


@@ -0,0 +1,493 @@
"""
Form field detection and recognition.
This module provides capabilities to:
1. Detect form fields (checkboxes, text fields, labels)
2. Extract field values
3. Map fields to structured data
"""
import logging
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
import numpy as np
from PIL import Image
logger = logging.getLogger(__name__)
class FormFieldDetector:
"""
Detect and extract form fields from document images.
Supports:
- Text field detection
- Checkbox detection and state recognition
- Label association
- Value extraction
Example:
>>> detector = FormFieldDetector()
>>> fields = detector.detect_form_fields("form.jpg")
>>> for field in fields:
... print(f"{field['label']}: {field['value']}")
>>> # Extract specific field types
>>> checkboxes = detector.detect_checkboxes("form.jpg")
>>> for cb in checkboxes:
... print(f"{cb['label']}: {'' if cb['checked'] else ''}")
"""
def __init__(self, use_gpu: bool = True):
"""
Initialize the form field detector.
Args:
use_gpu: Whether to use GPU acceleration if available
"""
self.use_gpu = use_gpu
self._handwriting_recognizer = None
def _get_handwriting_recognizer(self):
"""Lazy load handwriting recognizer for field value extraction."""
if self._handwriting_recognizer is None:
from .handwriting import HandwritingRecognizer
self._handwriting_recognizer = HandwritingRecognizer(use_gpu=self.use_gpu)
return self._handwriting_recognizer
def detect_checkboxes(
self,
image: Image.Image,
min_size: int = 10,
max_size: int = 50
) -> List[Dict[str, Any]]:
"""
Detect checkboxes in a form image.
Args:
image: PIL Image object
min_size: Minimum checkbox size in pixels
max_size: Maximum checkbox size in pixels
Returns:
List of detected checkboxes with state
[
{
'bbox': [x1, y1, x2, y2],
'checked': True/False,
'confidence': 0.95
},
...
]
"""
try:
import cv2
# Convert to OpenCV format
img_array = np.array(image)
if len(img_array.shape) == 3:
gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
else:
gray = img_array
# Detect edges
edges = cv2.Canny(gray, 50, 150)
# Find contours
contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
checkboxes = []
for contour in contours:
# Get bounding box
x, y, w, h = cv2.boundingRect(contour)
# Check if it looks like a checkbox (square-ish, right size)
aspect_ratio = w / h if h > 0 else 0
if (min_size <= w <= max_size and
min_size <= h <= max_size and
0.7 <= aspect_ratio <= 1.3):
# Extract checkbox region
checkbox_region = gray[y:y+h, x:x+w]
# Determine if checked (look for marks inside)
checked, confidence = self._is_checkbox_checked(checkbox_region)
checkboxes.append({
'bbox': [x, y, x+w, y+h],
'checked': checked,
'confidence': confidence
})
logger.info(f"Detected {len(checkboxes)} checkboxes")
return checkboxes
except ImportError:
logger.error("opencv-python not installed. Install with: pip install opencv-python")
return []
except Exception as e:
logger.error(f"Error detecting checkboxes: {e}")
return []
def _is_checkbox_checked(self, checkbox_image: np.ndarray) -> Tuple[bool, float]:
"""
Determine if a checkbox is checked.
Args:
checkbox_image: Grayscale image of checkbox
Returns:
Tuple of (is_checked, confidence)
"""
try:
import cv2
# Binarize
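# Otsu picks the threshold automatically; THRESH_BINARY_INV makes ink (dark) pixels non-zero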
_, binary = cv2.threshold(checkbox_image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
# Count dark pixels in the center region (where mark would be)
h, w = binary.shape
center_region = binary[int(h*0.2):int(h*0.8), int(w*0.2):int(w*0.8)]
if center_region.size == 0:
return False, 0.0
dark_pixel_ratio = np.sum(center_region > 0) / center_region.size
# If more than 15% of center is dark, consider it checked
checked = dark_pixel_ratio > 0.15
confidence = min(dark_pixel_ratio * 2, 1.0) # Scale confidence
return checked, confidence
except Exception as e:
logger.warning(f"Error checking checkbox state: {e}")
return False, 0.0
def detect_text_fields(
self,
image: Image.Image,
min_width: int = 100
) -> List[Dict[str, Any]]:
"""
Detect text input fields in a form.
Args:
image: PIL Image object
min_width: Minimum field width in pixels
Returns:
List of detected text fields
[
{
'bbox': [x1, y1, x2, y2],
'type': 'line' or 'box'
},
...
]
"""
try:
import cv2
# Convert to OpenCV format
img_array = np.array(image)
if len(img_array.shape) == 3:
gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
else:
gray = img_array
# Detect horizontal lines (underlines for text fields)
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (min_width, 1))
detect_horizontal = cv2.morphologyEx(
cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1],
cv2.MORPH_OPEN,
horizontal_kernel,
iterations=2
)
# Find contours of horizontal lines
contours, _ = cv2.findContours(
detect_horizontal,
cv2.RETR_EXTERNAL,
cv2.CHAIN_APPROX_SIMPLE
)
text_fields = []
for contour in contours:
x, y, w, h = cv2.boundingRect(contour)
# Check if it's a horizontal line (field underline)
if w >= min_width and h < 10:
# Expand upward to include text area
text_bbox = [x, max(0, y-30), x+w, y+h]
text_fields.append({
'bbox': text_bbox,
'type': 'line'
})
# Detect rectangular boxes (bordered text fields)
edges = cv2.Canny(gray, 50, 150)
contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
for contour in contours:
x, y, w, h = cv2.boundingRect(contour)
# Check if it's a rectangular box
aspect_ratio = w / h if h > 0 else 0
if w >= min_width and 20 <= h <= 100 and aspect_ratio > 2:
text_fields.append({
'bbox': [x, y, x+w, y+h],
'type': 'box'
})
logger.info(f"Detected {len(text_fields)} text fields")
return text_fields
except ImportError:
logger.error("opencv-python not installed")
return []
except Exception as e:
logger.error(f"Error detecting text fields: {e}")
return []
def detect_labels(
self,
image: Image.Image,
field_bboxes: List[List[int]]
) -> List[Dict[str, Any]]:
"""
Detect labels near form fields.
Args:
image: PIL Image object
field_bboxes: List of field bounding boxes [[x1,y1,x2,y2], ...]
Returns:
List of detected labels with associated field indices
"""
try:
import pytesseract
# Get all text with bounding boxes
ocr_data = pytesseract.image_to_data(
image,
output_type=pytesseract.Output.DICT
)
# Group text into potential labels
labels = []
for i, text in enumerate(ocr_data['text']):
if text.strip() and len(text.strip()) > 2:
x = ocr_data['left'][i]
y = ocr_data['top'][i]
w = ocr_data['width'][i]
h = ocr_data['height'][i]
label_bbox = [x, y, x+w, y+h]
# Find closest field
closest_field_idx = self._find_closest_field(label_bbox, field_bboxes)
labels.append({
'text': text.strip(),
'bbox': label_bbox,
'field_index': closest_field_idx
})
return labels
except ImportError:
logger.error("pytesseract not installed")
return []
except Exception as e:
logger.error(f"Error detecting labels: {e}")
return []
def _find_closest_field(
self,
label_bbox: List[int],
field_bboxes: List[List[int]]
) -> Optional[int]:
"""
Find the closest field to a label.
Args:
label_bbox: Label bounding box [x1, y1, x2, y2]
field_bboxes: List of field bounding boxes
Returns:
Index of closest field, or None if no fields
"""
if not field_bboxes:
return None
# Calculate center of label
label_center_x = (label_bbox[0] + label_bbox[2]) / 2
label_center_y = (label_bbox[1] + label_bbox[3]) / 2
min_distance = float('inf')
closest_idx = 0
for i, field_bbox in enumerate(field_bboxes):
# Calculate center of field
field_center_x = (field_bbox[0] + field_bbox[2]) / 2
field_center_y = (field_bbox[1] + field_bbox[3]) / 2
# Euclidean distance
distance = np.sqrt(
(label_center_x - field_center_x)**2 +
(label_center_y - field_center_y)**2
)
if distance < min_distance:
min_distance = distance
closest_idx = i
return closest_idx
def detect_form_fields(
self,
image_path: str,
extract_values: bool = True
) -> List[Dict[str, Any]]:
"""
Detect all form fields and extract their values.
Args:
image_path: Path to form image
extract_values: Whether to extract field values using OCR
Returns:
List of detected fields with labels and values
[
{
'type': 'text' or 'checkbox',
'label': 'Field Label',
'value': 'field value' or True/False,
'bbox': [x1, y1, x2, y2],
'confidence': 0.95
},
...
]
"""
try:
# Load image
image = Image.open(image_path).convert('RGB')
# Detect different field types
text_fields = self.detect_text_fields(image)
checkboxes = self.detect_checkboxes(image)
# Combine all field bboxes for label detection
all_field_bboxes = [f['bbox'] for f in text_fields] + [cb['bbox'] for cb in checkboxes]
# Detect labels
labels = self.detect_labels(image, all_field_bboxes)
# Build results
results = []
# Add text fields
for i, field in enumerate(text_fields):
# Find associated label
label_text = self._find_label_for_field(i, labels, len(text_fields))
result = {
'type': 'text',
'label': label_text,
'bbox': field['bbox'],
}
# Extract value if requested
if extract_values:
x1, y1, x2, y2 = field['bbox']
field_image = image.crop((x1, y1, x2, y2))
recognizer = self._get_handwriting_recognizer()
value = recognizer.recognize_from_image(field_image, preprocess=True)
result['value'] = value.strip()
result['confidence'] = recognizer._estimate_confidence(value)
results.append(result)
# Add checkboxes
for i, checkbox in enumerate(checkboxes):
field_idx = len(text_fields) + i
label_text = self._find_label_for_field(field_idx, labels, len(all_field_bboxes))
results.append({
'type': 'checkbox',
'label': label_text,
'value': checkbox['checked'],
'bbox': checkbox['bbox'],
'confidence': checkbox['confidence']
})
logger.info(f"Detected {len(results)} form fields from {image_path}")
return results
except Exception as e:
logger.error(f"Error detecting form fields: {e}")
return []
def _find_label_for_field(
self,
field_idx: int,
labels: List[Dict[str, Any]],
total_fields: int
) -> str:
"""
Find the label text for a specific field.
Args:
field_idx: Index of the field
labels: List of detected labels
total_fields: Total number of fields
Returns:
Label text, or a generated placeholder name (e.g. "Field_3") if no label was matched
"""
matching_labels = [
label for label in labels
if label['field_index'] == field_idx
]
if matching_labels:
# Combine multiple label parts if found
return ' '.join(label['text'] for label in matching_labels)
return f"Field_{field_idx + 1}"
def extract_form_data(
self,
image_path: str,
output_format: str = 'dict'
) -> Any:
"""
Extract all form data as structured output.
Args:
image_path: Path to form image
output_format: Output format ('dict', 'json', or 'dataframe')
Returns:
Structured form data in requested format
"""
# Detect and extract fields
fields = self.detect_form_fields(image_path, extract_values=True)
if output_format == 'dict':
# Return as dictionary
return {field['label']: field['value'] for field in fields}
elif output_format == 'json':
import json
data = {field['label']: field['value'] for field in fields}
return json.dumps(data, indent=2)
elif output_format == 'dataframe':
import pandas as pd
return pd.DataFrame(fields)
else:
raise ValueError(f"Invalid output format: {output_format}")
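A short end-to-end sketch of the detector above; the file name is a placeholder, and opencv-python, pytesseract and the TrOCR stack are assumed to be installed:

detector = FormFieldDetector(use_gpu=False)  # set True to use CUDA when available

fields = detector.detect_form_fields("scanned_form.png", extract_values=True)
for field in fields:
    print(field['type'], field['label'], field['value'], field['confidence'])

# Or collapse the form into a flat mapping / JSON string / DataFrame
data = detector.extract_form_data("scanned_form.png", output_format='json')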


@@ -0,0 +1,448 @@
"""
Handwriting recognition for documents.
This module provides handwriting OCR capabilities using:
1. TrOCR (Transformer-based OCR) for printed and handwritten text
2. Custom models fine-tuned for specific handwriting styles
3. Confidence scoring for recognition quality
"""
import logging
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
import numpy as np
from PIL import Image
logger = logging.getLogger(__name__)
class HandwritingRecognizer:
"""
Recognize handwritten text from document images.
Uses transformer-based models (TrOCR) for accurate handwriting recognition.
Supports both printed and handwritten text detection.
Example:
>>> recognizer = HandwritingRecognizer()
>>> text = recognizer.recognize_from_image("handwritten_note.jpg")
>>> print(text)
"This is handwritten text..."
>>> # With line detection
>>> lines = recognizer.recognize_lines("form.jpg")
>>> for line in lines:
... print(f"{line['text']} (confidence: {line['confidence']:.2f})")
"""
def __init__(
self,
model_name: str = "microsoft/trocr-base-handwritten",
use_gpu: bool = True,
confidence_threshold: float = 0.5,
):
"""
Initialize the handwriting recognizer.
Args:
model_name: Hugging Face model name
Options:
- "microsoft/trocr-base-handwritten" (default, good for English)
- "microsoft/trocr-large-handwritten" (more accurate, slower)
- "microsoft/trocr-base-printed" (for printed text)
use_gpu: Whether to use GPU acceleration if available
confidence_threshold: Minimum confidence for accepting recognition
"""
self.model_name = model_name
self.use_gpu = use_gpu
self.confidence_threshold = confidence_threshold
self._model = None
self._processor = None
def _load_model(self):
"""Lazy load the handwriting recognition model."""
if self._model is not None:
return
try:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import torch
logger.info(f"Loading handwriting recognition model: {self.model_name}")
self._processor = TrOCRProcessor.from_pretrained(self.model_name)
self._model = VisionEncoderDecoderModel.from_pretrained(self.model_name)
# Move to GPU if available and requested
if self.use_gpu and torch.cuda.is_available():
self._model = self._model.cuda()
logger.info("Using GPU for handwriting recognition")
else:
logger.info("Using CPU for handwriting recognition")
self._model.eval() # Set to evaluation mode
except ImportError as e:
logger.error(f"Failed to load handwriting model: {e}")
logger.error("Please install: pip install transformers torch pillow")
raise
def recognize_from_image(
self,
image: Image.Image,
preprocess: bool = True
) -> str:
"""
Recognize text from a single image.
Args:
image: PIL Image object containing handwritten text
preprocess: Whether to preprocess image (contrast, binarization)
Returns:
Recognized text string
"""
self._load_model()
try:
import torch
# Preprocess image if requested
if preprocess:
image = self._preprocess_image(image)
# Prepare image for model
pixel_values = self._processor(images=image, return_tensors="pt").pixel_values
if self.use_gpu and torch.cuda.is_available():
pixel_values = pixel_values.cuda()
# Generate text
with torch.no_grad():
generated_ids = self._model.generate(pixel_values)
# Decode to text
text = self._processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
logger.debug(f"Recognized text: {text[:100]}...")
return text
except Exception as e:
logger.error(f"Error recognizing handwriting: {e}")
return ""
def _preprocess_image(self, image: Image.Image) -> Image.Image:
"""
Preprocess image for better recognition.
Args:
image: Input PIL Image
Returns:
Preprocessed PIL Image
"""
try:
from PIL import ImageEnhance, ImageFilter
# Convert to grayscale
if image.mode != 'L':
image = image.convert('L')
# Enhance contrast
enhancer = ImageEnhance.Contrast(image)
image = enhancer.enhance(2.0)
# Denoise
image = image.filter(ImageFilter.MedianFilter(size=3))
# Convert back to RGB (required by model)
image = image.convert('RGB')
return image
except Exception as e:
logger.warning(f"Error preprocessing image: {e}")
return image
def detect_text_lines(self, image: Image.Image) -> List[Dict[str, Any]]:
"""
Detect individual text lines in an image.
Args:
image: PIL Image object
Returns:
List of detected lines with bounding boxes
[
{
'bbox': [x1, y1, x2, y2],
'image': PIL.Image
},
...
]
"""
try:
import cv2
# Convert PIL to OpenCV format
img_array = np.array(image)
if len(img_array.shape) == 3:
gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
else:
gray = img_array
# Binarize
_, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
# Find contours
contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
# Get bounding boxes for each contour
lines = []
for contour in contours:
x, y, w, h = cv2.boundingRect(contour)
# Filter out very small regions
if w > 20 and h > 10:
# Crop line from original image
line_img = image.crop((x, y, x+w, y+h))
lines.append({
'bbox': [x, y, x+w, y+h],
'image': line_img
})
# Sort lines top to bottom
lines.sort(key=lambda l: l['bbox'][1])
logger.info(f"Detected {len(lines)} text lines")
return lines
except ImportError:
logger.error("opencv-python not installed. Install with: pip install opencv-python")
return []
except Exception as e:
logger.error(f"Error detecting text lines: {e}")
return []
def recognize_lines(
self,
image_path: str,
return_confidence: bool = True
) -> List[Dict[str, Any]]:
"""
Recognize text from each line in an image.
Args:
image_path: Path to image file
return_confidence: Whether to include confidence scores
Returns:
List of recognized lines with text and metadata
[
{
'text': 'recognized text',
'bbox': [x1, y1, x2, y2],
'confidence': 0.95
},
...
]
"""
try:
# Load image
image = Image.open(image_path).convert('RGB')
# Detect lines
lines = self.detect_text_lines(image)
# Recognize each line
results = []
for i, line in enumerate(lines):
logger.debug(f"Recognizing line {i+1}/{len(lines)}")
text = self.recognize_from_image(line['image'], preprocess=True)
result = {
'text': text,
'bbox': line['bbox'],
'line_index': i
}
if return_confidence:
# Simple confidence based on text length and content
confidence = self._estimate_confidence(text)
result['confidence'] = confidence
results.append(result)
logger.info(f"Recognized {len(results)} lines from {image_path}")
return results
except Exception as e:
logger.error(f"Error recognizing lines from {image_path}: {e}")
return []
def _estimate_confidence(self, text: str) -> float:
"""
Estimate confidence of recognition result.
Args:
text: Recognized text
Returns:
Confidence score (0-1)
"""
if not text:
return 0.0
# Factors that indicate good recognition
score = 0.5 # Base score
# Longer text tends to be more reliable
if len(text) > 10:
score += 0.1
if len(text) > 20:
score += 0.1
# Text with alphanumeric characters is more reliable
if any(c.isalnum() for c in text):
score += 0.1
# Text with spaces (words) is more reliable
if ' ' in text:
score += 0.1
# Penalize if too many special characters
special_chars = sum(1 for c in text if not c.isalnum() and not c.isspace())
if special_chars / len(text) > 0.5:
score -= 0.2
return max(0.0, min(1.0, score))
def recognize_from_file(
self,
image_path: str,
mode: str = 'full'
) -> Dict[str, Any]:
"""
Recognize handwriting from an image file.
Args:
image_path: Path to image file
mode: Recognition mode
- 'full': Recognize entire image as one block
- 'lines': Detect and recognize individual lines
Returns:
Dictionary with recognized text and metadata
"""
try:
if mode == 'full':
# Recognize entire image
image = Image.open(image_path).convert('RGB')
text = self.recognize_from_image(image, preprocess=True)
return {
'text': text,
'mode': 'full',
'confidence': self._estimate_confidence(text)
}
elif mode == 'lines':
# Recognize line by line
lines = self.recognize_lines(image_path, return_confidence=True)
# Combine all lines
full_text = '\n'.join(line['text'] for line in lines)
avg_confidence = np.mean([line['confidence'] for line in lines]) if lines else 0.0
return {
'text': full_text,
'lines': lines,
'mode': 'lines',
'confidence': float(avg_confidence)
}
else:
raise ValueError(f"Invalid mode: {mode}. Use 'full' or 'lines'")
except Exception as e:
logger.error(f"Error recognizing from file {image_path}: {e}")
return {
'text': '',
'mode': mode,
'confidence': 0.0,
'error': str(e)
}
def recognize_form_fields(
self,
image_path: str,
field_regions: List[Dict[str, Any]]
) -> Dict[str, str]:
"""
Recognize text from specific form fields.
Args:
image_path: Path to form image
field_regions: List of field definitions
[
{
'name': 'field_name',
'bbox': [x1, y1, x2, y2]
},
...
]
Returns:
Dictionary mapping field names to recognized text
"""
try:
# Load image
image = Image.open(image_path).convert('RGB')
# Extract and recognize each field
results = {}
for field in field_regions:
name = field['name']
bbox = field['bbox']
# Crop field region
x1, y1, x2, y2 = bbox
field_image = image.crop((x1, y1, x2, y2))
# Recognize text
text = self.recognize_from_image(field_image, preprocess=True)
results[name] = text.strip()
logger.debug(f"Field '{name}': {text[:50]}...")
return results
except Exception as e:
logger.error(f"Error recognizing form fields: {e}")
return {}
def batch_recognize(
self,
image_paths: List[str],
mode: str = 'full'
) -> List[Dict[str, Any]]:
"""
Recognize handwriting from multiple images in batch.
Args:
image_paths: List of image file paths
mode: Recognition mode ('full' or 'lines')
Returns:
List of recognition results
"""
results = []
for i, path in enumerate(image_paths):
logger.info(f"Processing image {i+1}/{len(image_paths)}: {path}")
result = self.recognize_from_file(path, mode=mode)
result['image_path'] = path
results.append(result)
return results
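A usage sketch for the recognizer above; file names are placeholders, and the first call downloads the TrOCR weights from Hugging Face:

recognizer = HandwritingRecognizer(
    model_name="microsoft/trocr-base-handwritten",
    use_gpu=False,
)

# Whole page as one block
result = recognizer.recognize_from_file("note.jpg", mode='full')
print(result['text'], result['confidence'])

# Line by line, dropping low-confidence lines
for line in recognizer.recognize_lines("letter.jpg"):
    if line['confidence'] >= recognizer.confidence_threshold:
        print(line['line_index'], line['text'])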


@@ -0,0 +1,414 @@
"""
Table detection and extraction from documents.
This module uses various techniques to detect and extract tables from documents:
1. Image-based detection using deep learning (table-transformer)
2. PDF structure analysis
3. OCR-based table detection
"""
import logging
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
import numpy as np
from PIL import Image
logger = logging.getLogger(__name__)
class TableExtractor:
"""
Extract tables from document images and PDFs.
Supports multiple extraction methods:
- Deep learning-based table detection (table-transformer model)
- PDF structure parsing
- OCR-based table extraction
Example:
>>> extractor = TableExtractor()
>>> tables = extractor.extract_tables_from_image("invoice.png")
>>> for table in tables:
... print(table['data']) # pandas DataFrame
... print(table['bbox']) # bounding box coordinates
"""
def __init__(
self,
model_name: str = "microsoft/table-transformer-detection",
confidence_threshold: float = 0.7,
use_gpu: bool = True,
):
"""
Initialize the table extractor.
Args:
model_name: Hugging Face model name for table detection
confidence_threshold: Minimum confidence score for detection (0-1)
use_gpu: Whether to use GPU acceleration if available
"""
self.model_name = model_name
self.confidence_threshold = confidence_threshold
self.use_gpu = use_gpu
self._model = None
self._processor = None
def _load_model(self):
"""Lazy load the table detection model."""
if self._model is not None:
return
try:
from transformers import AutoImageProcessor, AutoModelForObjectDetection
import torch
logger.info(f"Loading table detection model: {self.model_name}")
self._processor = AutoImageProcessor.from_pretrained(self.model_name)
self._model = AutoModelForObjectDetection.from_pretrained(self.model_name)
# Move to GPU if available and requested
if self.use_gpu and torch.cuda.is_available():
self._model = self._model.cuda()
logger.info("Using GPU for table detection")
else:
logger.info("Using CPU for table detection")
except ImportError as e:
logger.error(f"Failed to load table detection model: {e}")
logger.error("Please install required packages: pip install transformers torch pillow")
raise
def detect_tables(self, image: Image.Image) -> List[Dict[str, Any]]:
"""
Detect tables in an image.
Args:
image: PIL Image object
Returns:
List of detected tables with bounding boxes and confidence scores
[
{
'bbox': [x1, y1, x2, y2], # coordinates
'score': 0.95, # confidence
'label': 'table'
},
...
]
"""
self._load_model()
try:
import torch
# Prepare image
inputs = self._processor(images=image, return_tensors="pt")
if self.use_gpu and torch.cuda.is_available():
inputs = {k: v.cuda() for k, v in inputs.items()}
# Run detection
with torch.no_grad():
outputs = self._model(**inputs)
# Post-process results
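# image.size is (width, height); post_process_object_detection expects (height, width), hence the reversal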
target_sizes = torch.tensor([image.size[::-1]])
results = self._processor.post_process_object_detection(
outputs,
threshold=self.confidence_threshold,
target_sizes=target_sizes
)[0]
# Convert to list of dicts
tables = []
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
tables.append({
'bbox': box.cpu().tolist(),
'score': score.item(),
'label': self._model.config.id2label[label.item()]
})
logger.info(f"Detected {len(tables)} tables in image")
return tables
except Exception as e:
logger.error(f"Error detecting tables: {e}")
return []
def extract_table_from_region(
self,
image: Image.Image,
bbox: List[float],
use_ocr: bool = True
) -> Optional[Dict[str, Any]]:
"""
Extract table data from a specific region of an image.
Args:
image: PIL Image object
bbox: Bounding box [x1, y1, x2, y2]
use_ocr: Whether to use OCR for text extraction
Returns:
Extracted table data as dictionary with 'data' (pandas DataFrame)
and 'raw_text' keys, or None if extraction failed
"""
try:
# Crop to table region
x1, y1, x2, y2 = [int(coord) for coord in bbox]
table_image = image.crop((x1, y1, x2, y2))
if use_ocr:
# Use OCR to extract text and structure
import pytesseract
# Get detailed OCR data
ocr_data = pytesseract.image_to_data(
table_image,
output_type=pytesseract.Output.DICT
)
# Reconstruct table structure from OCR data
table_data = self._reconstruct_table_from_ocr(ocr_data)
# Also get raw text
raw_text = pytesseract.image_to_string(table_image)
return {
'data': table_data,
'raw_text': raw_text,
'bbox': bbox,
'image_size': table_image.size
}
else:
# Fallback to basic OCR without structure
import pytesseract
raw_text = pytesseract.image_to_string(table_image)
return {
'data': None,
'raw_text': raw_text,
'bbox': bbox,
'image_size': table_image.size
}
except ImportError:
logger.error("pytesseract not installed. Install with: pip install pytesseract")
return None
except Exception as e:
logger.error(f"Error extracting table from region: {e}")
return None
def _reconstruct_table_from_ocr(self, ocr_data: Dict) -> Optional[Any]:
"""
Reconstruct table structure from OCR output.
Args:
ocr_data: OCR data from pytesseract
Returns:
pandas DataFrame or None if reconstruction failed
"""
try:
import pandas as pd
# Group text by vertical position (rows)
rows = {}
for i, text in enumerate(ocr_data['text']):
if text.strip():
top = ocr_data['top'][i]
left = ocr_data['left'][i]
# Group by approximate row (within 20 pixels)
row_key = round(top / 20) * 20
if row_key not in rows:
rows[row_key] = []
rows[row_key].append((left, text))
# Sort rows and create DataFrame
table_rows = []
for row_y in sorted(rows.keys()):
# Sort cells by horizontal position
cells = [text for _, text in sorted(rows[row_y])]
table_rows.append(cells)
if table_rows:
# Pad rows to same length
max_cols = max(len(row) for row in table_rows)
table_rows = [row + [''] * (max_cols - len(row)) for row in table_rows]
# Create DataFrame
df = pd.DataFrame(table_rows)
# Try to use first row as header if it looks like one
if len(df) > 1:
first_row_text = ' '.join(str(x) for x in df.iloc[0])
if not any(char.isdigit() for char in first_row_text):
df.columns = df.iloc[0]
df = df[1:].reset_index(drop=True)
return df
return None
except ImportError:
logger.error("pandas not installed. Install with: pip install pandas")
return None
except Exception as e:
logger.error(f"Error reconstructing table: {e}")
return None
def extract_tables_from_image(
self,
image_path: str,
output_format: str = 'dataframe'
) -> List[Dict[str, Any]]:
"""
Extract all tables from an image file.
Args:
image_path: Path to image file
output_format: 'dataframe' or 'csv' or 'json'
Returns:
List of extracted tables with data and metadata
"""
try:
# Load image
image = Image.open(image_path).convert('RGB')
# Detect tables
detections = self.detect_tables(image)
# Extract data from each table
tables = []
for i, detection in enumerate(detections):
logger.info(f"Extracting table {i+1}/{len(detections)}")
table_data = self.extract_table_from_region(
image,
detection['bbox']
)
if table_data:
table_data['detection_score'] = detection['score']
table_data['table_index'] = i
# Convert to requested format
if output_format == 'csv' and table_data['data'] is not None:
table_data['csv'] = table_data['data'].to_csv(index=False)
elif output_format == 'json' and table_data['data'] is not None:
table_data['json'] = table_data['data'].to_json(orient='records')
tables.append(table_data)
logger.info(f"Successfully extracted {len(tables)} tables from {image_path}")
return tables
except Exception as e:
logger.error(f"Error extracting tables from image {image_path}: {e}")
return []
def extract_tables_from_pdf(
self,
pdf_path: str,
page_numbers: Optional[List[int]] = None
) -> Dict[int, List[Dict[str, Any]]]:
"""
Extract tables from a PDF document.
Args:
pdf_path: Path to PDF file
page_numbers: List of page numbers to process (1-indexed), or None for all pages
Returns:
Dictionary mapping page numbers to lists of extracted tables
"""
try:
from pdf2image import convert_from_path
logger.info(f"Converting PDF to images: {pdf_path}")
# Convert PDF pages to images
if page_numbers:
first_page = min(page_numbers)
images = convert_from_path(
pdf_path,
first_page=first_page,
last_page=max(page_numbers)
)
else:
first_page = 1
images = convert_from_path(pdf_path)
# Extract tables from each page
results = {}
for i, image in enumerate(images):
# pdf2image converts the contiguous range first_page..last_page,
# so the true page number is an offset from first_page, not page_numbers[i]
page_num = first_page + i
# Skip converted pages that were not explicitly requested
if page_numbers and page_num not in page_numbers:
continue
logger.info(f"Processing page {page_num}")
# Detect and extract tables
detections = self.detect_tables(image)
tables = []
for detection in detections:
table_data = self.extract_table_from_region(
image,
detection['bbox']
)
if table_data:
table_data['detection_score'] = detection['score']
table_data['page'] = page_num
tables.append(table_data)
if tables:
results[page_num] = tables
logger.info(f"Found {len(tables)} tables on page {page_num}")
return results
except ImportError:
logger.error("pdf2image not installed. Install with: pip install pdf2image")
return {}
except Exception as e:
logger.error(f"Error extracting tables from PDF: {e}")
return {}
def save_tables_to_excel(
self,
tables: List[Dict[str, Any]],
output_path: str
) -> bool:
"""
Save extracted tables to an Excel file.
Args:
tables: List of table dictionaries with 'data' key containing DataFrame
output_path: Path to output Excel file
Returns:
True if successful, False otherwise
"""
try:
import pandas as pd
with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
for i, table in enumerate(tables):
if table.get('data') is not None:
sheet_name = f"Table_{i+1}"
if 'page' in table:
sheet_name = f"Page_{table['page']}_Table_{i+1}"
table['data'].to_excel(
writer,
sheet_name=sheet_name,
index=False
)
logger.info(f"Saved {len(tables)} tables to {output_path}")
return True
except ImportError:
logger.error("openpyxl not installed. Install with: pip install openpyxl")
return False
except Exception as e:
logger.error(f"Error saving tables to Excel: {e}")
return False
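Finally, a sketch of the PDF-to-Excel path above; paths are placeholders, and pdf2image (plus poppler), pytesseract, pandas and openpyxl are assumed to be installed:

extractor = TableExtractor(confidence_threshold=0.7, use_gpu=False)

# Pages are 1-indexed; pass None to process the whole document
tables_by_page = extractor.extract_tables_from_pdf("report.pdf", page_numbers=[1, 2])

# Flatten and export every table that produced a DataFrame
all_tables = [t for page_tables in tables_by_page.values() for t in page_tables]
extractor.save_tables_to_excel(all_tables, "report_tables.xlsx")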