#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Extract cell images from PDF pages
Usage: python extract_cells_from_pdf.py <pdf_file> <page_number> [output_folder]
"""

import cv2
import numpy as np
from pdf2image import convert_from_path
import sys
import os

def _sort_cells_reading_order(cells):
    """Order (x, y, w, h) boxes row by row, left to right within each row.

    Boxes whose top edges lie within half the median box height of the first
    box of a row are treated as belonging to that row. This keeps the order
    stable when cells of one visual row differ by a few pixels in ``y``,
    which a plain ``(y, x)`` sort would interleave incorrectly.
    """
    if not cells:
        return []

    by_top = sorted(cells, key=lambda c: (c[1], c[0]))

    # The median height is robust against a few unusually large/small boxes.
    heights = sorted(c[3] for c in by_top)
    row_tolerance = heights[len(heights) // 2] / 2

    rows = [[by_top[0]]]
    for cell in by_top[1:]:
        current_row = rows[-1]
        # Compare against the first (top-most) box of the current row.
        if cell[1] - current_row[0][1] > row_tolerance:
            rows.append([cell])
        else:
            current_row.append(cell)

    ordered = []
    for row in rows:
        ordered.extend(sorted(row, key=lambda c: (c[0], c[1])))
    return ordered


def detect_grid_cells(page_image):
    """Detect rectangular grid cells on a page image.

    The page is binarised, long horizontal and vertical strokes are isolated
    with morphological opening, and the contours of the resulting line mask
    are filtered by area and aspect ratio.

    The intended layout is 18 cells (6 rows x 3 cols), but the count is not
    enforced here; every contour passing the filters is returned.

    Args:
        page_image: Page as a NumPy image array. A 3-channel image is
            converted with ``COLOR_BGR2GRAY``; a 2-D array is used as-is.

    Returns:
        List of ``(x, y, w, h)`` bounding boxes ordered top to bottom and,
        within each row, left to right.
    """
    # Accept an already-grayscale image instead of failing in cvtColor.
    if page_image.ndim == 2:
        gray = page_image
    else:
        gray = cv2.cvtColor(page_image, cv2.COLOR_BGR2GRAY)

    # Binarise once (ink becomes white) and reuse the mask for both directions.
    _, binary = cv2.threshold(
        gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )

    # Opening with a wide, 1-px-tall kernel keeps only long horizontal strokes.
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (80, 1))
    horizontal_lines = cv2.morphologyEx(
        binary, cv2.MORPH_OPEN, horizontal_kernel, iterations=2
    )

    # Same idea with a tall, 1-px-wide kernel for vertical strokes.
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 80))
    vertical_lines = cv2.morphologyEx(
        binary, cv2.MORPH_OPEN, vertical_kernel, iterations=2
    )

    # Combine lines into a single grid mask
    grid = cv2.add(horizontal_lines, vertical_lines)

    # Find contours
    contours, _ = cv2.findContours(grid, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    page_h, page_w = page_image.shape[:2]
    min_area = (page_w * page_h) * 0.02  # At least 2% of page

    # NOTE(review): the outer border of a connected grid may also pass these
    # filters and show up as an extra "cell" - confirm on real pages.
    cells = []
    for cnt in contours:
        if cv2.contourArea(cnt) <= min_area:
            continue
        x, y, w, h = cv2.boundingRect(cnt)
        # Reasonable aspect ratio for voter cells
        aspect = float(w) / h if h > 0 else 0
        if 0.3 < aspect < 3.0:
            cells.append((x, y, w, h))

    # Sort by position (top to bottom, left to right), tolerating small
    # vertical offsets between cells of the same row.
    return _sort_cells_reading_order(cells)

def extract_cells_from_page(pdf_path, page_num, output_folder='cells'):
    """Render one PDF page, crop every detected cell and save the crops.

    Besides one PNG per cell, an annotated copy of the page (boxes and
    "Cell N" labels) is written to the same folder. Progress and errors are
    reported on stdout.

    Args:
        pdf_path: Path of the PDF file to read.
        page_num: Page number passed to ``convert_from_path`` as both
            ``first_page`` and ``last_page``.
        output_folder: Directory for the output images; created if missing.

    Returns:
        None. Returns early if the page could not be rendered.
    """
    # Create output folder
    os.makedirs(output_folder, exist_ok=True)

    print(f"Converting page {page_num} from PDF...")
    pages = convert_from_path(pdf_path, dpi=300, first_page=page_num, last_page=page_num)

    if not pages:
        print(f"ERROR: Could not extract page {page_num}")
        return

    # Convert the PIL page to an OpenCV BGR array
    # (assumes the rendered page is in RGB mode - TODO confirm).
    page_bgr = cv2.cvtColor(np.array(pages[0]), cv2.COLOR_RGB2BGR)

    print("Detecting cells...")
    cells = detect_grid_cells(page_bgr)
    print(f"Found {len(cells)} cells")

    # Boxes are drawn on a copy so the saved crops stay free of annotations.
    annotated = page_bgr.copy()
    saved_count = 0

    # Extract and save each cell
    for idx, (x, y, w, h) in enumerate(cells, 1):
        cell_img = page_bgr[y:y+h, x:x+w]

        output_path = os.path.join(output_folder, f"cell_{idx:02d}_page{page_num}.png")
        # cv2.imwrite signals failure through its return value, so check it
        # rather than reporting a file that was never written.
        if cv2.imwrite(output_path, cell_img):
            saved_count += 1
            print(f"  Saved: {output_path} ({w}x{h})")
        else:
            print(f"  ERROR: Could not write {output_path}")

        # Draw box and label on annotated image
        cv2.rectangle(annotated, (x, y), (x+w, y+h), (0, 255, 0), 3)
        cv2.putText(annotated, f"Cell {idx}", (x+10, y+30),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

    # Save annotated image
    annotated_path = os.path.join(output_folder, f"page{page_num}_annotated.png")
    if cv2.imwrite(annotated_path, annotated):
        print(f"\nSaved annotated image: {annotated_path}")
    else:
        print(f"\nERROR: Could not write annotated image: {annotated_path}")
    print(f"Extracted {saved_count} cells to folder: {output_folder}/")

def main():
    """Command-line entry point.

    Reads ``<pdf_file> <page_number> [output_folder]`` from ``sys.argv``,
    validates them and runs the extraction. Exits with status 1 when the
    arguments are missing, the page number is not a positive integer, or
    the PDF file does not exist. The output folder defaults to
    ``page<N>_cells``.
    """
    usage = "Usage: python extract_cells_from_pdf.py <pdf_file> <page_number> [output_folder]"

    if len(sys.argv) < 3:
        print(usage)
        print("\nExample:")
        print("  python extract_cells_from_pdf.py voter_list.pdf 6")
        print("  python extract_cells_from_pdf.py voter_list.pdf 6 page6_cells")
        sys.exit(1)

    pdf_file = sys.argv[1]

    # Report a bad page argument as a usage error instead of a traceback.
    try:
        page_num = int(sys.argv[2])
    except ValueError:
        print(f"ERROR: Page number must be an integer: {sys.argv[2]}")
        print(usage)
        sys.exit(1)

    # PDF pages are numbered from 1.
    if page_num < 1:
        print(f"ERROR: Page number must be 1 or greater: {page_num}")
        sys.exit(1)

    output_folder = sys.argv[3] if len(sys.argv) > 3 else f"page{page_num}_cells"

    # isfile (not exists) so that a directory path is rejected as well.
    if not os.path.isfile(pdf_file):
        print(f"ERROR: PDF file not found: {pdf_file}")
        sys.exit(1)

    extract_cells_from_page(pdf_file, page_num, output_folder)


if __name__ == '__main__':
    main()
