#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Extract voter data from cell images to CSV using template matching and OCR
Usage: python extract_to_csv.py <input_folder> [output_csv]
"""

import cv2
import numpy as np
from PIL import Image
import pytesseract
import pandas as pd
import re
import os
import sys

class VoterDataExtractor:
    """Extract voter fields from cell images using template matching and OCR.

    Each field label (name, voter id, father, ...) is located inside the cell
    image with ``cv2.matchTemplate``; the field value is then read by running
    Tesseract on the strip of pixels next to the matched label.
    """

    # Template image file (inside the template directory) for each label.
    TEMPLATE_FILES = {
        'name': 'name_label.png',
        'voter_id': 'voter_id_label.png',
        'father': 'father_label.png',
        'mother': 'mother_label.png',
        'profession': 'profession_label.png',
        'address': 'address_label.png',
        'dob': 'dob_label.png',
    }

    # Padding constants (same as annotation script)
    V_PAD_TOP = 3
    V_PAD_BOTTOM = 8
    H_GAP = 5
    # Pixels kept clear of the right/bottom cell edge when reading a value.
    EDGE_MARGIN = 5

    # 1-4 digits (Bengali or ASCII) for the serial number.
    _SERIAL_RE = re.compile(r'[০-৯0-9]{1,4}')
    # DD MM YYYY in Bengali or ASCII digits; the separators are optional
    # because OCR frequently drops or garbles them.
    _DOB_RE = re.compile(r'([০-৯0-9]{2})[/.\-]?([০-৯0-9]{2})[/.\-]?([০-৯0-9]{4})')
    _DIGIT_TABLE = str.maketrans('০১২৩৪৫৬৭৮৯', '0123456789')

    def __init__(self, template_dir='wider_templates'):
        """Load the label templates as grayscale images.

        Args:
            template_dir: Directory containing the ``*_label.png`` template
                images. Defaults to ``wider_templates`` relative to the
                current working directory.

        A template that cannot be read is stored as ``None`` and reported as
        ``MISSING!``; matching for that label is then skipped.
        """
        self.templates = {
            name: cv2.imread(os.path.join(template_dir, filename), cv2.IMREAD_GRAYSCALE)
            for name, filename in self.TEMPLATE_FILES.items()
        }

        print("Templates loaded:")
        for name, template in self.templates.items():
            if template is not None:
                print(f"  {name}: {template.shape}")
            else:
                print(f"  {name}: MISSING!")

    def find_template(self, image_gray, template, threshold=0.7):
        """Find the best match of ``template`` inside ``image_gray``.

        Args:
            image_gray: Grayscale image to search.
            template: Grayscale template image, or ``None``.
            threshold: Minimum ``TM_CCOEFF_NORMED`` score to accept.

        Returns:
            ``(x, y, w, h, score)`` of the best match, or ``None`` when the
            template is missing, does not fit inside the image, or scores
            below ``threshold``.
        """
        if template is None or image_gray is None:
            return None

        tpl_h, tpl_w = template.shape[:2]
        img_h, img_w = image_gray.shape[:2]
        # matchTemplate raises when the template is larger than the image;
        # an oversized template simply cannot match.
        if tpl_h > img_h or tpl_w > img_w:
            return None

        result = cv2.matchTemplate(image_gray, template, cv2.TM_CCOEFF_NORMED)
        _, max_val, _, max_loc = cv2.minMaxLoc(result)

        if max_val >= threshold:
            return (*max_loc, tpl_w, tpl_h, max_val)
        return None

    def extract_text_region(self, image, x, y, w, h):
        """OCR a rectangular region of ``image`` as a single text line.

        The rectangle is clipped to the image bounds first.

        Args:
            image: Image array; a 3-channel array is assumed to be in OpenCV
                BGR order (as returned by ``cv2.imread``).
            x, y: Top-left corner of the region (may be negative).
            w, h: Width and height of the region.

        Returns:
            The stripped OCR text, or ``""`` when the clipped region is empty.
        """
        # Handle negative coordinates
        if y < 0:
            h = h + y
            y = 0
        if x < 0:
            w = w + x
            x = 0

        # Handle out of bounds
        img_h, img_w = image.shape[:2]
        if x + w > img_w:
            w = img_w - x
        if y + h > img_h:
            h = img_h - y

        if w <= 0 or h <= 0:
            return ""

        # Crop region
        region = image[y:y + h, x:x + w]
        # PIL interprets 3-channel arrays as RGB, OpenCV stores BGR; convert
        # so Tesseract does not see red and blue swapped.
        if region.ndim == 3 and region.shape[2] == 3:
            region = cv2.cvtColor(region, cv2.COLOR_BGR2RGB)
        region_pil = Image.fromarray(region)

        # OCR with Bengali + English
        text = pytesseract.image_to_string(region_pil, lang='ben+eng', config=r'--oem 3 --psm 7')
        return text.strip()

    def bengali_to_english_number(self, text):
        """Convert Bengali numerals in ``text`` to ASCII digits.

        Empty or ``None`` input is returned unchanged.
        """
        if not text:
            return text
        return text.translate(self._DIGIT_TABLE)

    def clean_text(self, text):
        """Strip bracket/quote/pipe noise and collapse whitespace.

        Empty or ``None`` input is returned unchanged.
        """
        if not text:
            return text
        text = re.sub(r'[|\\<>{}[\]()"\']', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def _parse_dob(self, text):
        """Return the first date found in ``text`` as ``DD/MM/YYYY``, else ``None``."""
        match = self._DOB_RE.search(text or '')
        if match is None:
            return None
        # Rebuild from the captured parts so the result always uses '/'
        # regardless of which separator (if any) survived OCR.
        return '/'.join(self.bengali_to_english_number(part) for part in match.groups())

    def _read_value(self, image, label_box, right_edge=None, bottom_edge=None):
        """OCR the value strip immediately to the right of a matched label.

        Args:
            image: Cell image passed through to ``extract_text_region``.
            label_box: ``(x, y, w, h, ...)`` match returned by ``find_template``.
            right_edge: X coordinate where the strip ends; defaults to the
                image width minus ``EDGE_MARGIN``.
            bottom_edge: Y coordinate where the strip ends; defaults to the
                label height plus the vertical padding.
        """
        label_x, label_y, label_w, label_h = label_box[:4]
        x = label_x + label_w + self.H_GAP
        y = label_y - self.V_PAD_TOP
        if right_edge is None:
            right_edge = image.shape[1] - self.EDGE_MARGIN
        if bottom_edge is None:
            h = label_h + self.V_PAD_TOP + self.V_PAD_BOTTOM
        else:
            h = bottom_edge - y
        return self.extract_text_region(image, x, y, right_edge - x, h)

    def extract_cell(self, cell_path):
        """Extract voter data from a single cell image.

        Args:
            cell_path: Path of the cell image file.

        Returns:
            A dict with any of the keys ``serial_no``, ``name``, ``voter_id``,
            ``father_name``, ``mother_name``, ``profession``,
            ``date_of_birth`` and ``address`` that passed validation, or
            ``None`` when the image cannot be read, no label template
            matches, or no field could be extracted.
        """
        cell_color = cv2.imread(cell_path)
        if cell_color is None:
            print(f"  ERROR: Could not read {cell_path}")
            return None

        cell_gray = cv2.cvtColor(cell_color, cv2.COLOR_BGR2GRAY)
        cell_h = cell_color.shape[0]

        # Find all template positions
        positions = {}
        for label, template in self.templates.items():
            match = self.find_template(cell_gray, template)
            if match is not None:
                positions[label] = match

        # If no templates found, likely a migrated cell
        if not positions:
            return None

        voter = {}

        if 'name' in positions:
            name_x, name_y, _, name_h, _ = positions['name']

            # 1. SERIAL NUMBER - left of name label
            serial_w = name_x - 3
            if serial_w > 10:
                serial_text = self.extract_text_region(
                    cell_color,
                    0,
                    name_y - self.V_PAD_TOP,
                    serial_w,
                    name_h + self.V_PAD_TOP + self.V_PAD_BOTTOM,
                )
                serial_match = self._SERIAL_RE.search(serial_text)
                if serial_match:
                    serial = self.bengali_to_english_number(serial_match.group(0))
                    voter['serial_no'] = serial.zfill(4)

            # 2. NAME - right of name label
            name = self.clean_text(self._read_value(cell_color, positions['name']))
            if len(name) >= 3:
                voter['name'] = name

        # 3. VOTER ID - right of voter_id label; keep digits only
        if 'voter_id' in positions:
            text = self._read_value(cell_color, positions['voter_id'])
            vid_clean = re.sub(r'[^\d]', '', self.bengali_to_english_number(text))
            if 10 <= len(vid_clean) <= 14:
                voter['voter_id'] = vid_clean

        # 4./5. FATHER and MOTHER - right of their labels
        for label, field in (('father', 'father_name'), ('mother', 'mother_name')):
            if label in positions:
                value = self.clean_text(self._read_value(cell_color, positions[label]))
                if len(value) >= 3:
                    voter[field] = value

        # 6. PROFESSION - right of profession label, before DOB label
        if 'profession' in positions:
            # If DOB found, stop before it; otherwise use the same right
            # margin as every other field.
            right_edge = positions['dob'][0] if 'dob' in positions else None
            text = self._read_value(cell_color, positions['profession'], right_edge=right_edge)
            profession = re.sub(r'[,।]', '', self.clean_text(text)).strip()
            if len(profession) >= 2:
                voter['profession'] = profession

        # 7. DATE OF BIRTH - right of dob label
        if 'dob' in positions:
            dob = self._parse_dob(self._read_value(cell_color, positions['dob']))
            if dob:
                voter['date_of_birth'] = dob

        # 8. ADDRESS - right of address label (full remaining height)
        if 'address' in positions:
            text = self._read_value(
                cell_color,
                positions['address'],
                bottom_edge=cell_h - self.EDGE_MARGIN,
            )
            address = self.clean_text(text)
            if len(address) >= 5:
                voter['address'] = address

        return voter if voter else None


def main():
    """Command-line entry point: OCR every cell image in a folder into a CSV.

    Reads ``<input_folder>`` (required) and ``[output_csv]`` (optional) from
    ``sys.argv``. Every ``.png`` file in the folder whose name does not
    contain ``annotated`` or ``boxes`` is passed to
    ``VoterDataExtractor.extract_cell``; the extracted records are written to
    the CSV and a per-field completeness summary is printed.

    Exits with status 1 when the arguments are missing or the input folder
    is not a directory.
    """
    if len(sys.argv) < 2:
        print("Usage: python extract_to_csv.py <input_folder> [output_csv]")
        print("\nExample:")
        print("  python extract_to_csv.py page6_cells")
        print("  python extract_to_csv.py page6_cells voter_data.csv")
        sys.exit(1)

    input_folder = sys.argv[1]
    if len(sys.argv) > 2:
        output_csv = sys.argv[2]
    else:
        # normpath drops a trailing separator, so "cells/" produces
        # "cells_data.csv" instead of "cells/_data.csv" inside the folder.
        output_csv = f"{os.path.normpath(input_folder)}_data.csv"

    # isdir (not exists): a regular file would make os.listdir fail below.
    if not os.path.isdir(input_folder):
        print(f"ERROR: Input folder not found: {input_folder}")
        sys.exit(1)

    # Initialize extractor
    print("\n" + "=" * 70)
    print("VOTER DATA EXTRACTION - Cell Images to CSV")
    print("=" * 70)

    extractor = VoterDataExtractor()

    # Get all cell images (exclude annotated ones and box images)
    cell_files = sorted(
        f for f in os.listdir(input_folder)
        if f.lower().endswith('.png')
        and 'annotated' not in f.lower()
        and 'boxes' not in f.lower()
    )

    print(f"\nInput folder: {input_folder}")
    print(f"Total cell images: {len(cell_files)}")
    print(f"Output CSV: {output_csv}\n")

    # Extract data from each cell
    all_voters = []

    for idx, cell_file in enumerate(cell_files, 1):
        cell_path = os.path.join(input_folder, cell_file)

        print(f"[{idx}/{len(cell_files)}] Processing {cell_file}...", end=' ')

        voter = extractor.extract_cell(cell_path)

        if voter:
            voter['source_file'] = cell_file
            all_voters.append(voter)
            print(f"✓ {voter.get('name', 'N/A'):30s} | ID: {voter.get('voter_id', 'N/A')}")
        else:
            print("✗ (migrated/empty)")

    print("\n" + "=" * 70)
    print("EXTRACTION COMPLETE")
    print("=" * 70)
    print(f"Total voters extracted: {len(all_voters)}")

    if not all_voters:
        print("\n✗ No voters extracted!")
        return

    column_order = ['source_file', 'serial_no', 'voter_id', 'name',
                    'father_name', 'mother_name', 'address', 'date_of_birth', 'profession']

    # reindex fixes the column order and adds wholly missing columns.
    # fillna('') then turns the NaN pandas inserts for fields missing from
    # individual records into empty strings; without it `!= ''` below counts
    # NaN as filled and the coverage figures are inflated.
    df = pd.DataFrame(all_voters).reindex(columns=column_order).fillna('')

    # Save to CSV
    df.to_csv(output_csv, index=False, encoding='utf-8-sig')
    print(f"\n✓ Saved to: {output_csv}")

    # Show sample
    print(f"\nFirst 5 records:")
    print(df.head().to_string())

    # Show statistics
    total = len(df)
    print(f"\n" + "=" * 70)
    print("DATA COMPLETENESS")
    print("=" * 70)
    print(f"Total records: {total}")
    print(f"\nField coverage:")
    for col in ['voter_id', 'name', 'father_name', 'mother_name', 'date_of_birth', 'profession', 'address']:
        filled = int((df[col] != '').sum())
        pct = (filled / total) * 100
        print(f"  {col:15s}: {filled:4d}/{total:4d} ({pct:5.1f}%)")

# Run the extraction only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
