#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Complete voter data extraction pipeline
Modular workflow: PDF -> Pages -> Cells -> OCR -> CSV

Usage:
    python extract_pipeline.py <pdf_file> [<start_page> <end_page>] [options]

Example:
    python extract_pipeline.py voter_list.pdf              # Process all pages
    python extract_pipeline.py voter_list.pdf 1 10         # Process pages 1-10
    python extract_pipeline.py voter_list.pdf --cleanup    # All pages with cleanup
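    python extract_pipeline.py voter_list.pdf --ocr google # Use Google Cloud Vision OCR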
"""

import sys
import os
import argparse
import shutil
import time
from pathlib import Path
from pdf2image import pdfinfo_from_path

# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

from workflow.pdf_to_cells import PDFToCellsExtractor
from workflow.cells_to_csv import CellsToCSVExtractor
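# Interface assumed from usage in this script (implementations live in the workflow package):
#   PDFToCellsExtractor(pdf_path).extract_pages(start, end, out_dir) -> number of cell images written
#   CellsToCSVExtractor(...).extract_all_cells(cells_dir, csv_path)  -> number of voter records written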


def get_pdf_page_count(pdf_path):
    """Get total number of pages in PDF"""
    try:
        info = pdfinfo_from_path(pdf_path)
        return info['Pages']
    except Exception as e:
        print(f"Warning: Could not determine page count: {e}")
        return None


class VoterExtractionPipeline:
    def __init__(self, pdf_path, start_page, end_page, output_dir="pipeline_output",
                 ocr_engine="tesseract", google_credentials_path=None,
                 batch_size=16, max_requests_per_minute=1800, gemini_api_key=None, gemini_model='gemini-2.5-flash'):
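        # Engine-specific settings (per the CLI help): google_credentials_path and batch_size
        # apply to the Google Cloud Vision path, gemini_api_key / gemini_model to the Gemini
        # path, and max_requests_per_minute throttles the API-based engines.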
        self.pdf_path = pdf_path
        self.start_page = start_page
        self.end_page = end_page
        self.output_dir = output_dir
        self.ocr_engine = ocr_engine
        self.google_credentials_path = google_credentials_path
        self.gemini_api_key = gemini_api_key
        self.gemini_model = gemini_model
        self.batch_size = batch_size
        self.max_requests_per_minute = max_requests_per_minute

        # Create output directory
        os.makedirs(self.output_dir, exist_ok=True)

        # Subdirectories for intermediate files
        self.cells_dir = os.path.join(self.output_dir, "cells")
        self.csv_output = os.path.join(self.output_dir, "voter_data.csv")

    def run(self):
        """Execute the full pipeline"""
        start_time = time.time()

        print("=" * 80)
        print("VOTER DATA EXTRACTION PIPELINE")
        print("=" * 80)
        print(f"PDF: {self.pdf_path}")
        print(f"Pages: {self.start_page} to {self.end_page}")
        print(f"Output directory: {self.output_dir}")
        print("=" * 80)

        # Step 1: Extract cells from PDF pages
        print("\n[STEP 1/2] Extracting cells from PDF pages...")
        print("-" * 80)
        step1_start = time.time()

        cell_extractor = PDFToCellsExtractor(self.pdf_path)
        total_cells = cell_extractor.extract_pages(
            self.start_page,
            self.end_page,
            self.cells_dir
        )

        step1_time = time.time() - step1_start
        print(f"\n✓ Extracted {total_cells} cells from {self.end_page - self.start_page + 1} pages")
        print(f"  Cells saved to: {self.cells_dir}/")
        print(f"  Time: {step1_time:.1f}s")

        # Step 2: Extract data from cells to CSV
        print("\n[STEP 2/2] Extracting voter data from cells...")
        print("-" * 80)
        step2_start = time.time()

        csv_extractor = CellsToCSVExtractor(
            ocr_engine=self.ocr_engine,
            google_credentials_path=self.google_credentials_path,
            batch_size=self.batch_size,
            max_requests_per_minute=self.max_requests_per_minute,
            gemini_api_key=self.gemini_api_key,
            gemini_model=self.gemini_model
        )
        num_voters = csv_extractor.extract_all_cells(self.cells_dir, self.csv_output)

        step2_time = time.time() - step2_start
        print(f"\n✓ Extracted {num_voters} voter records")
        print(f"  CSV saved to: {self.csv_output}")
        print(f"  Time: {step2_time:.1f}s")

        # Summary
        total_time = time.time() - start_time
        print("\n" + "=" * 80)
        print("PIPELINE COMPLETE")
        print("=" * 80)
        print(f"Total pages processed: {self.end_page - self.start_page + 1}")
        print(f"Total cells extracted: {total_cells}")
        print(f"Total voter records: {num_voters}")
        print(f"\nTiming:")
        print(f"  Step 1 (Cell extraction): {step1_time:.1f}s")
        print(f"  Step 2 (OCR + CSV):       {step2_time:.1f}s")
        print(f"  Total time:               {total_time:.1f}s ({total_time/60:.1f} minutes)")
        print(f"\nFinal output: {self.csv_output}")
        print("=" * 80)

        return self.csv_output

    def cleanup(self):
        """Remove intermediate files and directories"""
        print("\n[CLEANUP] Removing intermediate files...")

        if os.path.exists(self.cells_dir):
            shutil.rmtree(self.cells_dir)
            print(f"  ✓ Removed: {self.cells_dir}/")

        # Keep only the CSV file
        print(f"\n✓ Cleanup complete. CSV retained at: {self.csv_output}")


def main():
    parser = argparse.ArgumentParser(
        description="Extract voter data from PDF to CSV",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Basic usage
  python extract_pipeline.py voter_list.pdf                              # Process all pages with Tesseract
  python extract_pipeline.py voter_list.pdf --cleanup                    # All pages with cleanup
  python extract_pipeline.py voter_list.pdf 1 10                         # Process pages 1-10
  python extract_pipeline.py voter_list.pdf 6 6 --cleanup                # Single page
  python extract_pipeline.py voter_list.pdf 1 20 --output data           # Custom output

  # Google Gemini API (defaults to gemini-2.5-flash; high accuracy for Bengali)
  python extract_pipeline.py voter_list.pdf --ocr gemini --gemini-api-key YOUR_API_KEY
  python extract_pipeline.py voter_list.pdf --ocr gemini --gemini-api-key YOUR_API_KEY --cleanup

  # Google Cloud Vision API (fastest with batch processing)
  python extract_pipeline.py voter_list.pdf --ocr google                 # Use Google Cloud Vision API
  python extract_pipeline.py voter_list.pdf --ocr google --google-creds google-credentials.json

  # Batch processing optimization (Google Vision only)
  python extract_pipeline.py voter_list.pdf --ocr google --batch-size 16 --rate-limit 1800  # Maximum speed
  python extract_pipeline.py voter_list.pdf --ocr google --batch-size 8  --rate-limit 900   # Half speed (if lower quota)
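
  # Select a specific Gemini model (default: gemini-2.5-flash)
  python extract_pipeline.py voter_list.pdf --ocr gemini --gemini-api-key YOUR_API_KEY --gemini-model gemini-2.5-pro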
        """
    )

    parser.add_argument('pdf_file', help='Path to PDF file')
    parser.add_argument('start_page', nargs='?', type=int, default=None,
                       help='Starting page number (default: 1)')
    parser.add_argument('end_page', nargs='?', type=int, default=None,
                       help='Ending page number (default: last page)')
    parser.add_argument('--cleanup', action='store_true',
                       help='Remove intermediate files after extraction')
    parser.add_argument('--output', default='pipeline_output',
                       help='Output directory (default: pipeline_output)')
    parser.add_argument('--ocr', choices=['tesseract', 'google', 'gemini'], default='tesseract',
                       help='OCR engine to use: tesseract (free, default), google (Cloud Vision API, fastest with batching), or gemini (Gemini API, high accuracy; model set by --gemini-model)')
    parser.add_argument('--google-creds', default='google-credentials.json',
                       help='Path to Google Cloud credentials JSON file (default: google-credentials.json)')
    parser.add_argument('--gemini-api-key', default=None,
                       help='Google Gemini API key (required for --ocr gemini)')
    parser.add_argument('--gemini-model', default='gemini-2.5-flash',
                       help='Gemini model to use (default: gemini-2.5-flash). Options: gemini-2.5-flash-lite, gemini-2.5-flash, gemini-2.5-pro')
    parser.add_argument('--batch-size', type=int, default=16,
                       help='Batch size for Google Vision API (1-16, default: 16 for maximum speed)')
    parser.add_argument('--rate-limit', type=int, default=1800,
                       help='Maximum requests per minute for API calls (default: 1800)')

    args = parser.parse_args()

    # Validate inputs
    if not os.path.exists(args.pdf_file):
        print(f"ERROR: PDF file not found: {args.pdf_file}")
        sys.exit(1)

    # Determine page range
    if args.start_page is None and args.end_page is None:
        # No page range specified - process all pages
        total_pages = get_pdf_page_count(args.pdf_file)
        if total_pages is None:
            print("ERROR: Could not determine PDF page count")
            sys.exit(1)
        start_page = 1
        end_page = total_pages
        print(f"No page range specified. Processing all {total_pages} pages.")
    elif args.start_page is not None and args.end_page is not None:
        # Both specified
        start_page = args.start_page
        end_page = args.end_page
    else:
        print("ERROR: Please specify both start_page and end_page, or neither for all pages")
        sys.exit(1)

    # Validate page range
    if start_page < 1 or end_page < start_page:
        print("ERROR: Invalid page range")
        sys.exit(1)

    # Clamp batch size to the valid 1-16 range (see --batch-size help)
    if not 1 <= args.batch_size <= 16:
        clamped = min(max(args.batch_size, 1), 16)
        print(f"WARNING: Batch size must be between 1 and 16. Using {clamped}.")
        args.batch_size = clamped

    # Run pipeline
    pipeline = VoterExtractionPipeline(
        args.pdf_file,
        start_page,
        end_page,
        args.output,
        ocr_engine=args.ocr,
        google_credentials_path=args.google_creds,
        batch_size=args.batch_size,
        max_requests_per_minute=args.rate_limit,
        gemini_api_key=args.gemini_api_key,
        gemini_model=args.gemini_model
    )

    pipeline.run()

    # Cleanup if requested
    if args.cleanup:
        pipeline.cleanup()


if __name__ == '__main__':
    main()
