
    Xil                     t    d Z ddlZddlZddlmZ ddlZddlZddl	Z	ddl
Z
ddlZddlmZ  G d d          ZdS )z
Module: Cells to CSV Extractor
Extracts voter data from cell images using OCR
Supports both Tesseract and Google Cloud Vision API with batch processing
    N)Image)defaultdictc                       e Zd Z	 	 ddZddZd	 ZddZd Zd ZddZ	d Z
d Zd ZddZd Zd Zd Zd Zd Zd ZdS )CellsToCSVExtractor	tesseractN     gemini-2.5-flashc           
         |                                 | _        t          |d          | _        || _        g | _        d| _        d| _        | j        dk    r	 ddlm	} |r|t          j        d<   |                                | _        t          d| j         d           t          d	| j         d
           n]# t          $ r) t          d           t          d           d| _        Y n+t          $ r2}t          d|            t          d           d| _        Y d}~nd}~ww xY w| j        dk    r	 ddlm}	 |r|	                    |           |	                    |          | _        t          d|            t          d	| j         d
           n~# t          $ r( t          d           t          d           d| _        Y nMt          $ r2}t          d|            t          d           d| _        Y d}~nd}~ww xY wt          d           t          j                            t          j                            t.                    dd          }
t1          j        |
 dt0          j                  t1          j        |
 dt0          j                  t1          j        |
 dt0          j                  t1          j        |
 dt0          j                  t1          j        |
 dt0          j                  t1          j        |
 dt0          j                  t1          j        |
 dt0          j                  d | _        d!| _        d"| _        d#| _        dS )$a6  
        Initialize the extractor with specified OCR engine

        Args:
            ocr_engine: 'tesseract', 'google', or 'gemini' (default: 'tesseract')
            google_credentials_path: Path to Google Cloud credentials JSON file
            batch_size: Number of images to process per batch (max 16 for Google, default: 16)
            max_requests_per_minute: Rate limit for API calls (default: 1800)
            gemini_api_key: Google Gemini API key (for gemini ocr_engine)
            gemini_model: Gemini model name (default: 'gemini-2.5-flash')
        r   Ngoogler   visionGOOGLE_APPLICATION_CREDENTIALSz5  Using Google Cloud Vision API for OCR (batch mode: z images)z  Rate limit: z requests/minutezH  Warning: google-cloud-vision not installed. Falling back to Tesseract.z/  Install with: pip install google-cloud-visionr   z5  Warning: Failed to initialize Google Cloud Vision: z  Falling back to Tesseract.gemini)api_keyz  Using Google Gemini model: zH  Warning: google-generativeai not installed. Falling back to Tesseract.z/  Install with: pip install google-generativeaiz/  Warning: Failed to initialize Google Gemini: z  Using Tesseract OCRz..wider_templatesz/name_label.pngz/voter_id_label.pngz/father_label.pngz/mother_label.pngz/profession_label.pngz/address_label.pngz/dob_label.png)namevoter_idfathermother
professionaddressdob         )lower
ocr_enginemin
batch_sizemax_requests_per_minuterequest_timesvision_clientgemini_modelgoogle.cloudr   osenvironImageAnnotatorClientprintImportError	Exceptiongoogle.generativeaigenerativeai	configureGenerativeModelpathjoindirname__file__cv2imreadIMREAD_GRAYSCALE	templates	V_PAD_TOPV_PAD_BOTTOMH_GAP)selfr   google_credentials_pathr    r!   gemini_api_keyr$   r   egenaitemplate_dirs              K/var/www/development/aibuddy-work/election-extract/workflow/cells_to_csv.py__init__zCellsToCSVExtractor.__init__   s    %**,,j"--'>$ " ?h&&.//////* [CZBJ?@%+%@%@%B%B"gdoggghhhUt'CUUUVVVV . . .`aaaGHHH"- . . .QaQQRRR4555"-. _((.333333! <OONO;;;$)$9$9,$G$G!DlDDEEEUt'CUUUVVVV . . .`aaaGHHH"- . . .KKKLLL4555"-.
 *+++ w||BGOOH$=$=tEVWW J,???AUVV
l#G#G#GI]^^jL!C!C!CSEYZZjL!C!C!CSEYZZ*%K%K%KSMabbz\"E"E"EsG[\\:===s?STT
 
 


s>   A B9 9/D&+	D&4(D!!D&5A"F /H		H(G??Hffffff?c                     |dS t          j        ||t           j                  }t          j        |          \  }}}}||k    r|j        \  }	}
g ||
|	|R S dS )zFind template in imageN)r4   matchTemplateTM_CCOEFF_NORMED	minMaxLocshape)r;   
image_graytemplate	thresholdresultmin_valmax_valmin_locmax_lochws              rA   find_templatez!CellsToCSVExtractor.find_templatea   st    4":x9MNN-0]6-B-B*'7i>DAq,W,a,,G,,,t    c                 2   t          |j                  dk    r t          j        |t          j                  }n|                                }t          j        dd          }|                    |          }t          j        |t          j                  }|S )zMEnhance image for better OCR by reducing watermark and boosting text contrastr   g      @)r   r   )	clipLimittileGridSize)	lenrH   r4   cvtColorCOLOR_BGR2GRAYcopycreateCLAHEapplyCOLOR_GRAY2BGR)r;   imagegrayclaheenhancedrL   s         rA   _enhance_for_ocrz$CellsToCSVExtractor._enhance_for_ocrn   s~     u{q  <s'9::DD::<<D #EBBB;;t$$ h(:;;rT      c           	         t          j                     fd| j        D             | _        t          | j                  |z   | j        k    r| j        r| j        d         }d|z
  z
  }|dk    rqt	          dt          | j                   d| j         d|dd           t          j        |           t          j                     fd	| j        D             | _        n| j                            d           nn t          | j                  |z   | j        k    t          |          D ]}| j                                       d
S )z
        Check and enforce rate limiting for API calls

        Args:
            num_images: Number of images in this request (each counts toward quota)
        c                 &    g | ]}|z
  d k     |S <    .0tcurrent_times     rA   
<listcomp>z8CellsToCSVExtractor.rate_limit_check.<locals>.<listcomp>   s)    UUUA|a?ORT?T?Ta?T?T?TrT   r   gN@z    Rate limit: /z used. Waiting .1fzs...c                 &    g | ]}|z
  d k     |S rg   ri   rj   s     rA   rn   z8CellsToCSVExtractor.rate_limit_check.<locals>.<listcomp>   s*    )a)a)a<Z[K[^`K`K`!K`K`K`rT   N)	timer"   rX   r!   r)   sleeppoprangeappend)r;   
num_imagesoldest_time
sleep_time_rm   s        @rA   rate_limit_checkz$CellsToCSVExtractor.rate_limit_check   s    y{{ VUUU);UUU $$%%
2T5QQQ! "03!\K%?@
>>  IS1C-D-D  I  ItGc  I  It~  I  I  I  I  J  J  JJz***#'9;;L)a)a)a)aT5G)a)a)aD&& &**1---- # $$%%
2T5QQQ( z"" 	4 	4A%%l3333	4 	4rT   c                    | j         si S ddlm} i }t          |          | j        z   dz
  | j        z  }t          t          dt          |          | j                  d          D ]\  }}t          || j        z   t          |                    }|||         }|                     t          |                     t          d| d| dt          |           dt          | j
                   d		           g }	|D ]\  }
}|                     |
          }t          j        d
|          \  }}|                                }|                    |          }|	                    |d|j        j        j        igd           	 | j                             |	          }t          |j                  D ]Y\  }}||         d         }|j        j        rd||<   %|j        r(|j        d         j                                        ||<   Td||<   Z# t6          $ r*}t          d|            |D ]
\  }}d||<   Y d}~d}~ww xY w|S )z
        Process multiple image regions in a single batch request

        Args:
            image_regions: List of tuples (image_data, region_id)

        Returns:
            Dict mapping region_id to extracted text
        r   r   rd   rw       Processing batch ro    (z	 images,  in quota window)....pngcontenttype_r_   featuresrequests     Batch OCR error: N)r#   r%   r   rX   r    	enumerateru   r   r{   r)   r"   rc   r4   imencodetobytesr   rv   FeatureTypeTEXT_DETECTIONbatch_annotate_images	responseserrormessagetext_annotationsdescriptionstripr+   )r;   image_regionsr   resultstotal_batches	batch_idxbatch_start	batch_endbatchr   
image_data	region_idprocessed_imagerz   encoded_imager   r_   responseidximage_responser>   s                        rA   batch_ocr_googlez$CellsToCSVExtractor.batch_ocr_google   s    ! 	I'''''']++do=AdoU '0a]9K9KT_0]0]_`&a&a -	, -	,"I{K$/93};M;MNNI!+i"78E !!SZZ!888  J)  J  Jm  J  Js5zz  J  J\_`d`r\s\s  J  J  J  K  K  K H).  %
I"&"7"7
"C"C $'<#H#H ='//11 W55"")6>+>+M!N O! !    
,-CCXCVV ,5X5G+H+H 0 0'C %c
1I%+3 0-/	**'8 0-;-LQ-O-[-a-a-c-c	**-/	**0  , , ,1a11222$) , ,LAy)+GI&&, , , , ,, s   B	H
IH>>Ic                    |dk     r||z   }d}|dk     r||z   }d}|j         dd         \  }}||z   |k    r||z
  }||z   |k    r||z
  }|dk    s|dk    rdS ||||z   |||z   f         }	 ddlm}	 t          j        d|          \  }
}|                                }|	                    |          }| j                            |          }|j	        j
        rt          d	|j	        j
                    dS |j        }|r|d         j                                        S dS # t          $ r}t          d
|            Y d}~dS d}~ww xY w)zExtract text using Google Cloud Vision API

        Args:
            image: Source image
            x, y, w, h: Region coordinates
        r   N   r   r   r   r   )r_   z    Google Vision API error: z!    Google Vision API exception: )rH   r%   r   r4   r   r   r   r#   text_detectionr   r   r)   r   r   r   r+   )r;   r_   xyrR   rQ   img_himg_wregionr   rz   r   r   vision_imager   textsr>   s                    rA   extract_text_region_googlez.CellsToCSVExtractor.extract_text_region_google   s    q55AAAq55AAA{2A2uq55==	Aq55==	A66Q!VV2 q1ua!e|$	++++++"|FF;;A}#++--G "<<<88L )88|8LLH~% Nhn6LNNOOOr-E 4Qx+113332 	 	 	9a99:::22222	s   ,BD# 9'D# #
E
-EE
Fc                    | j         dk    r | j        r|                     |||||          S |dk     r||z   }d}|dk     r||z   }d}|j        dd         \  }}||z   |k    r||z
  }||z   |k    r||z
  }|dk    s|dk    rdS ||||z   |||z   f         }	t	          |	j                  dk    r t          j        |	t
          j                  }
n|	}
|r2t          j        |
ddt
          j	        t
          j
        z             \  }}n|
}t          j        |          }t          j        |dd	
          }|                                S )zExtract text from region using configured OCR engine

        Args:
            image: Source image
            x, y, w, h: Region coordinates
            use_binary: If True, apply binary threshold (only for Tesseract)
        r   r   Nr   r   r      zben+engz--oem 3 --psm 7)langconfig)r   r#   r   rH   rX   r4   rY   rZ   rK   THRESH_BINARYTHRESH_OTSUr   	fromarraypytesseractimage_to_stringr   )r;   r_   r   r   rR   rQ   
use_binaryr   r   r   r`   rz   	processed
region_piltexts                  rA   extract_text_regionz'CellsToCSVExtractor.extract_text_region  s}    ?h&&4+=&225!Q1EEE q55AAAq55AAA {2A2uq55==	Aq55==	A66Q!VV2 q1ua!e|$ v|!!<(:;;DDD 	 =q#s7H3?7Z[[LAyy I _Y//
 *:IN`aaazz||rT   c                 j    |s|S t                               dd          }|                    |          S )z#Convert Bengali numerals to Englishu   ০১২৩৪৫৬৭৮৯
0123456789)str	maketrans	translate)r;   r   transs      rA   bengali_to_english_numberz-CellsToCSVExtractor.bengali_to_english_numberU  s5     	K>MM~~e$$$rT   c                     |s|S t          j        dd|          }t          j        dd|                                          }|S )zClean extracted textz[|\\<>{}[\]()"\']r   z\s+ )resubr   )r;   r   s     rA   
clean_textzCellsToCSVExtractor.clean_text\  sG     	Kv*B55vfc4((..00rT   c                 	   t          j        |          }|dS t          j        |t           j                  }|j        dd         \  }}i }| j                                        D ]"\  }	}
|                     ||
          }|r|||	<   #t          |          dk    rdS g }d|v rx|d         \  }}}}}d|| j	        z
  }}|dz
  }|| j	        z   | j
        z   }|dk    rA|dk    r;|                     |||||          }| d| d	| d
}|                    ||f           d|v r||d         \  }}}}}||z   | j        z   }|| j	        z
  }||z
  dz
  }|| j	        z   | j
        z   }|                     |||||d          }| d| d	| d}|                    ||f           d|v r|d         \  }}}}}||z   | j        z   }|| j	        z
  dz
  }||z
  dz
  }|| j	        z   | j
        z   dz   }|                     |||||d          }| d| d	| d}|                    ||f           d|v r||d         \  }}}}}||z   | j        z   }|| j	        z
  }||z
  dz
  }|| j	        z   | j
        z   }|                     |||||d          }| d| d	| d}|                    ||f           d|v r||d         \  }}}}}||z   | j        z   }|| j	        z
  }||z
  dz
  }|| j	        z   | j
        z   }|                     |||||d          }| d| d	| d}|                    ||f           d|v r|d         \  } }!}"}#}d|v r|d         \  }$}}}}|$| |"z   z
  | j        z
  }n|| |"z   z
  dz
  }| |"z   | j        z   }|!| j	        z
  }|#| j	        z   | j
        z   }|                     |||||d          }| d| d	| d}|                    ||f           d|v r||d         \  }}}}}||z   | j        z   }|| j	        z
  }||z
  dz
  }|| j	        z   | j
        z   }|                     |||||d          }| d| d	| d}|                    ||f           d|v rp|d         \  }}}}}||z   | j        z   }|| j	        z
  }||z
  dz
  }||z
  dz
  }|                     |||||          }| d| d	| d}|                    ||f           |||||ffS )z
        Prepare regions from a cell for batch OCR processing

        Returns:
            Tuple of (regions_list, positions, cell_color, cell_dimensions)
            where regions_list contains (image_data, region_id) tuples
        N)NNNNr   r   r   r   
   p_c_serialr   T)trim_right_whitespace_namer      	_voter_idr   _fatherr   _motherr   r   _profession_dobr   _address)r4   r5   rY   rZ   rH   r7   itemsrS   rX   r8   r9   _crop_regionrv   r:   )%r;   	cell_pathpage_numcell_num
cell_color	cell_graycell_hcell_w	positionstemplate_namerJ   matchregionsname_xname_yname_wname_hrz   serial_xserial_yserial_wserial_hr   r   value_xvalue_yvalue_wvalue_hlabel_xlabel_ylabel_wlabel_hprof_xprof_yprof_wprof_hdob_xs%                                        rA   extract_regions_for_batchz-CellsToCSVExtractor.extract_regions_for_batchd  s    Z	**
))LS-?@@	#)"1"- 	'+~';';'='= 	1 	1#M8&&y(;;E 1+0	-( y>>Q)) Y09&0A-FFFFA!"FT^$;hHzH.1BBH"}}Q**:x8U]^^% AH A A A A AINNFI#6777 Y09&0A-FFFFAvo
2Gt~-Gw&*Gt~-0AAG&&z7GWgmq&rrF!;;;H;;;		2333 ""4=j4I1GWgw'$*4G.3Gw&*G.1BBRGG&&z7GWgmq&rrF!???H???		2333 y  4=h4G1GWgw'$*4G.Gw&*G.1BBG&&z7GWgmq&rrF!===H===		2333 y  4=h4G1GWgw'$*4G.Gw&*G.1BBG&&z7GWgmq&rrF!===H===		2333 9$$09,0G-FFFFA	!!$-e$4!q!Q6F?3dj@ FVO4q8vo
2Gt~-Gt~-0AAG&&z7GWgmq&rrF!AAAHAAA		2333 I4=e4D1GWgw'$*4G.Gw&*G.1BBG&&z7GWgmq&rrF!:::H:::		2333 	!!4=i4H1GWgw'$*4G.Gw&*Gw&*G&&z7GWgVVF!>>>H>>>		2333	:/???rT   c                    |dk     r||z   }d}|dk     r||z   }d}|j         dd         \  }}||z   |k    r||z
  }||z   |k    r||z
  }|dk    s|dk    rdS ||||z   |||z   f         }	|rt          |	j                   dk    r t          j        |	t          j                  }
n|	}
d}d}t          |
j         d         dz
  dd          D ]F}|
dd|f         }t          j        ||k               }|dk    r|t          |          d	z  k     r|} nG|dk    r.d
}t          ||z   |	j         d                   }|	ddd|f         }	|	S )zSafely crop a region from an image

        Args:
            image: Source image
            x, y, w, h: Region coordinates
            trim_right_whitespace: If True, crop white space from right side until black text found
        r   Nr   r      rd      g?   )	rH   rX   r4   rY   rZ   ru   npsumr   )r;   r_   r   r   rR   rQ   r   r   r   r   r`   dark_thresholdrightmost_textcolcolumn_pixels
dark_countmargin	new_widths                     rA   r   z CellsToCSVExtractor._crop_region  s    q55AAAq55AAA {2A2uq55==	Aq55==	A66Q!VV4 q1ua!e|$ ! 	/6<  A%%|FC,>?? !N NTZ]Q.B77   $QQQVVMN$BCC
 >>jC4F4F4L&M&M%(NE !! 7aII	:I:.rT   c                    | j         si S ddlm} i }t          |          | j        z   dz
  | j        z  }t          t          dt          |          | j                  d          D ]\  }}t          || j        z   t          |                    }|||         }|                     t          |                     t          d| d| dt          |           dt          | j
                   d		           g }	|D ]s\  }
}}}t          j        d
|
          \  }}|                                }|                    |          }|	                    |d|j        j        j        igd           t	 | j                             |	          }t          |j                  D ]`\  }}||         \  }
}}}|j        j        rd|||f<   &|j        r,|j        d         j                                        }||||f<   Yd|||f<   a# t4          $ r.}t          d|            |D ]\  }
}}}d|||f<   Y d}~d}~ww xY w|S )a  
        Process full cell images in batches with Google Cloud Vision

        Args:
            cell_images: List of tuples (cell_image, page_num, cell_num, cell_path)

        Returns:
            Dict mapping (page_num, cell_num) to full OCR text
        r   r   rd   r}   r~   ro   r   z cells, r   r   r   r   r   r   r   r   N)r#   r%   r   rX   r    r   ru   r   r{   r)   r"   r4   r   r   r   rv   r   r   r   r   r   r   r   r   r   r   r+   )r;   cell_imagesr   r   r   r   r   r   r   r   
cell_imager   r   r   rz   r   r   r_   r   r   r   	full_textr>   s                          rA   batch_ocr_full_cellsz(CellsToCSVExtractor.batch_ocr_full_cells/  s    ! 	I''''''[))DO;a?DOS '0a[9I9I4?0[0[]^&_&_ ,	7 ,	7"I{K$/93{;K;KLLII 56E !!SZZ!888  I)  I  Im  I  Is5zz  I  I[^_c_q[r[r  I  I  I  J  J  J H=B 
 
9
Hh	#&<
#C#C ='//11 W55"")6>+>+M!N O! !    
7-CCXCVV ,5X5G+H+H 
; 
;'C@Ec
=J(I%+3 ;8:8 455'8 ;$2$CA$F$R$X$X$Z$Z	8A8 4558:8 455
;  7 7 71a11222AF 7 7=J(I46GXx0117 7 7 7 77 s   1BH
H;#H66H;c                    | j         si S i }t          |          }d}t          |d          D ]K\  }\  }}}}	|                     d           |dz  dk    s||k    r+t	          d| d| dt          | j                   d	           	 t          |j                  d
k    r t          j        |t          j	                  }
nt          j        |t          j
                  }
t          j        |
          }| j                             ||g          }|r(|j        r!|j                                        }||||f<   nd|||f<   # t           $ r*}t	          d| d| d|            d|||f<   Y d}~Ed}~ww xY w|S )a   
        Process full cell images with Google Gemini 2.5 Flash-Lite

        Args:
            cell_images: List of tuples (cell_image, page_num, cell_num, cell_path)

        Returns:
            Dict mapping (page_num, cell_num) to full OCR text
        u  Extract all text from this Bengali voter information card image with MAXIMUM ACCURACY.

CRITICAL: Pay special attention to Bengali numerals (০১২৩৪৫৬৭৮৯) which appear in:
- Serial numbers (example: ০০০১, ০০১৫)
- Voter ID numbers (example: ৬৮০৩২৩১৬৯৬৫১)
- Dates (example: ০১/০১/১৯৮০)

The image contains voter details in Bengali language with fields like:
- Serial number (4 digits with dot)
- নাম (Name)
- ভোটার নং (Voter ID Number) - VERY IMPORTANT: 12-13 digit number
- পিতা (Father's Name)
- মাতা (Mother's Name)
- পেশা (Profession)
- জন্ম তারিখ (Date of Birth)
- ঠিকানা (Address)

Extract ALL text EXACTLY as it appears in Bengali script, maintaining original spelling and numbers.
Return ONLY the raw text without any translations, formatting, or added labels.rd   r}   r   r   z    Processing image ro   r   r   r   r   z    Gemini OCR error for page z cell : N)r$   rX   r   r{   r)   r"   rH   r4   rY   COLOR_BGR2RGBCOLOR_GRAY2RGBr   r   generate_contentr   r   r+   )r;   r  r   total_imagespromptr   r  r   r   r   	rgb_image	pil_imager   r  r>   s                  rA   batch_ocr_full_cells_geminiz/CellsToCSVExtractor.batch_ocr_full_cells_geminir  s      	I;''S* AJ+WX@Y@Y 	3 	3<C<*h)!!Q!///Rx1}}| 3 3qcqqLqqCHZD[D[qqqrrr3 z'((A-- #Z9J K KII #Z9K L LI!OI66	  ,==vy>QRR  7 7 ( 3 3 5 5I4=GXx01146GXx01 3 3 3VxVVxVVSTVVWWW028,------3 s   	B8E
E7E22E7c                    |                     ||fd          }|sdS ||d}|                    d          }|D ]}|                                }t          j        d|          rWt          j        d|          }|r@|                     |                    d                    }	|	                    d          |d	<   d
|v sd|v rCt          j	        dd|          }
| 
                    |
          }t          |          dk    r||d<   d|v sd|v rat          j	        dd|          }t          j	        dd|                     |                    }dt          |          cxk    rdk    rn n||d<   d|v sd|v rCt          j	        dd|          }| 
                    |          }t          |          dk    r||d<   d|v sd|v rCt          j	        dd|          }| 
                    |          }t          |          dk    r||d<   d|v sd|v r|t          j	        d d|          }d!|v r|                    d!          }| 
                    |d"                   }t          j	        d#d|                                          }t          |          d$k    r||d%<   t          |          dk    r|d                             d&d                                          }t          j        d'|          }|rC|                     |                    d                    }|                    d(d)          }||d*<   nU| 
                    |          }t          j	        d#d|                                          }t          |          d$k    r||d%<   d+|v sd,|v rCt          j	        d-d|          }| 
                    |          }t          |          d.k    r||d/<   t          |          d$k    r|ndS )0z*Parse full cell OCR text into voter recordr   Npagecell
u   ^[০-৯0-9।|]+\.u   ([০-৯0-9]{1,4})\.rd      	serial_nou
   নাম:u   নাম :u   .*?নাম\s*:\s*r   r   u   ভোটার নং:u   ভোটার নং :u    .*?ভোটার নং\s*:\s*[^\d]r      r   u   পিতা:u   পিতা :u   .*?পিতা\s*:\s*father_nameu   মাতা:u   মাতা :u   .*?মাতা\s*:\s*mother_nameu   পেশা:u   পেশা :u   .*?পেশা\s*:\s*u   জন্ম তারিখr      [,।]r   r   :9   ([০-৯0-9]{2}[/.]?[০-৯0-9]{2}[/.]?[০-৯0-9]{4}).ro   date_of_birthu   ঠিকানা:u   ঠিকানা :u   .*?ঠিকানা\s*:\s*r   r   )getsplitr   r   r   searchr   groupzfillr   r   rX   replace)r;   ocr_resultsr   r   r  voterlineslineserial_matchserial	name_text
name_cleanr   	vid_cleanr   r   	prof_textpartsr   dob_text	dob_matchr   r   s                          rA   parse_full_cell_ocrz'CellsToCSVExtractor.parse_full_cell_ocr  s>   OOXx$8"==	 	4 
 
 %%  E	/ E	/D::<<D x/66 9!y)A4HH 9!;;L<N<Nq<Q<QRRF)/aE+& t##}'<'<F#92tDD	!__Y77
z??a''$.E&M )D004NRV4V4VvA2tLLF8R1O1OPT1U1UVV	Y----2-----(1E*% $&&*:d*B*B 92tDD00v;;!##+1E-( $&&*:d*B*B 92tDD00v;;!##+1E-( $&&*:d*B*BF#<b$GG	1Y>>%OO,JKKE!%q!:!:J!#	2z!B!B!H!H!J!JJ:!++.8l+ 5zzA~~#(8#3#3C#<#<#B#B#D#D$&I.jlt$u$u	$ 9"&"@"@QRASAS"T"TC"%++c3"7"7C58E/2!%!;!;J!#	2z!B!B!H!H!J!JJ:!++.8l+ %,,0F$0N0N&!@"dKK//'22w<<1$$'.E)$E

QuuD0rT   c                    ||d}d| d| d}||v r_||         }t          j        d|          }|r@|                     |                    d                    }|                    d          |d<   d| d| d	}	|	|v r3|                     ||	                   }
t          |
          d
k    r|
|d<   d| d| d}||v rS||         }t          j        dd|                     |                    }dt          |          cxk    rdk    rn n||d<   d| d| d}||v r3|                     ||                   }t          |          d
k    r||d<   d| d| d}||v r3|                     ||                   }t          |          d
k    r||d<   d| d| d}||v r[|                     ||                   }t          j        dd|                                          }t          |          dk    r||d<   d| d| d}||v rb||         }t          j        d|          }|rC|                     |                    d                    }|	                    dd          }||d<   d| d| d}||v r3|                     ||                   }t          |          d k    r||d!<   t          |          dk    r|nd"S )#z)Parse batch OCR results into voter recordr  r   r   r      ([০-৯0-9]{1,4})rd   r  r   r   r   r   r   r!  r   r   r"  r   r   r#  r   r$  r   r%  r   r   r   r'  r(  ro   r)  r   r   r   N)
r   r,  r   r-  r.  r   rX   r   r   r/  )r;   r0  r   r   r1  
serial_keyserial_textr4  r5  name_keyr6  vid_keyr   r8  
father_keyr   
mother_keyr   prof_keyr   dob_keyr<  r   addr_keyr   s                            rA   parse_batch_resultsz'CellsToCSVExtractor.parse_batch_results  s    
 
 766X666
$$%j1K9%;[IIL 5778J8J18M8MNN%+\\!__k" 3x228222{""H(=>>I9~~"" )f 6h55(555k!!w'DxT-K-KD-Q-QRRIS^^))))r)))))$-j! 766X666
$$__[%<==F6{{a'-m$ 766X666
$$__[%<==F6{{a'-m$ 9x888888{""X)>??J	2z::@@BBJ:!##&0l# 1h00(000k!!w'D	"^`deeI -44Y__Q5G5GHHkk#s++),o& 6x558555{""ook(&;<<G7||q  #*i E

QuuD0rT   c                 p   t          j        |          }|dS t          j        |t           j                  }|j        dd         \  }}||d}i }	| j                                        D ]"\  }
}|                     ||          }|r||	|
<   #t          |	          dk    rdS d|	v r|	d         \  }}}}}d}|| j	        z
  }|dz
  }|| j	        z   | j
        z   }|dk    rr|                     |||||d	          }t          j        d
|          }|r@|                     |                    d                    }|                    d          |d<   d|	v r|	d         \  }}}}}||z   | j        z   }|| j	        z
  }||z
  dz
  }|| j	        z   | j
        z   }|                     |||||          }|                     |          }t          |          dk    r||d<   d|	v r|	d         \  }} }!}"}#||!z   | j        z   }| | j	        z
  }||z
  dz
  }|"| j	        z   | j
        z   }|                     |||||d	          }$t          j        dd|                     |$                    }%dt          |%          cxk    rdk    rn n|%|d<   d|	v r|	d         \  }} }!}"}#||!z   | j        z   }| | j	        z
  }||z
  dz
  }|"| j	        z   | j
        z   }|                     |||||          }$|                     |$          }&t          |&          dk    r|&|d<   d|	v r|	d         \  }} }!}"}#||!z   | j        z   }| | j	        z
  }||z
  dz
  }|"| j	        z   | j
        z   }|                     |||||          }$|                     |$          }'t          |'          dk    r|'|d<   d|	v r|	d         \  }(})}*}+}#d|	v r|	d         \  },}-}.}/}0|,|(|*z   z
  | j        z
  }n||(|*z   z
  dz
  }|(|*z   | j        z   }|)| j	        z
  }|+| j	        z   | j
        z   }|                     |||||          }$|                     |$          }1t          j        dd|1                                          }1t          |1          dk    r|1|d<   d|	v r|	d         \  }} }!}"}#||!z   | j        z   }| | j	        z
  }||z
  dz
  }|"| j	        z   | j
        z   }|                     |||||d	          }$t          j        d|$          }2|2rC|                     |2                    d                    }3|3                    dd          }3|3|d<   d|	v r{|	d         \  }} }!}"}#||!z   | j        z   }| | j	        z
  }||z
  dz
  }||z
  dz
  }|                     |||||          }$|                     |$          }4t          |4          dk    r|4|d<   t          |          dk    r|ndS )z+Extract voter data from a single cell imageNr   r  r   r   r   r   T)r   r?  rd   r  r   r   r   r!  r   r"  r   r#  r   r$  r   r   r%  r'  r(  ro   r)  r   )r4   r5   rY   rZ   rH   r7   r   rS   rX   r8   r9   r   r   r,  r   r-  r.  r:   r   r   r   r/  )5r;   r   r   r   r   r   r   r   r1  r   r   rJ   r   r   r   r   r   
name_scorer   r   r   r   rA  r4  r5  r   r   r   r   r6  r7  r   r   r   r   scorer   r8  r   r   r   r   r   r   r   dob_ydob_wdob_h	dob_scorer   r<  r   r   s5                                                        rA   extract_cellz CellsToCSVExtractor.extract_cellX  s   Z	**
4LS-?@@	#)"1"- 
 
 	'+~';';'='= 	1 	1#M8&&y(;;E 1+0	-( y>>Q4 Y9B69J6FFFFJH.HzH.1BBH"}}"66z8XW_aivz6{{!y)?MM 9!;;L<N<Nq<Q<QRRF)/aE+& Y9B69J6FFFFJvo
2Gt~-Gw&*Gt~-0AAG00WgwX_``I33J:!## *f ""8A*8M5GWgw'$*4G.Gw&*G.1BBG++J'SZgk+llDxT-K-KD-Q-QRRIS^^))))r)))))$-j! y  8A(8K5GWgw'$*4G.Gw&*G.1BBG++J'SZ[[D__T**F6{{a'-m$ y  8A(8K5GWgw'$*4G.Gw&*G.1BBG++J'SZ[[D__T**F6{{a'-m$ 9$$4=l4K1FFFFE	!!8A%8H5ueUI6F?3dj@ FVO4q8vo
2Gt~-Gt~-0AAG++J'SZ[[D..J	2z::@@BBJ:!##&0l# I8A%8H5GWgw'$*4G.Gw&*G.1BBG++J'SZgk+llD	"^`deeI -44Y__Q5G5GHHkk#s++),o& 	!!8A)8L5GWgw'$*4G.Gw&*Gw&*G++J'SZ[[Dood++G7||q  #*i E

QuuD0rT   c                    t          d t          j        |          D                       }g }| j        dk    r| j        s| j        dk    rS| j        rK| j        dk    r!t          dt          |           d           n t          dt          |           d           g }g }t          dt          |           d	           t          |d
          D ]\  }}t          j	        
                    ||          }	|                    dd                              d          }
t          |
d                             dd                    }t          |
d
                             dd                    }t          j        |	          }|0|                    ||||	f           |                    ||f           t          dt          |           d           | j        dk    r!t          dt          |           d           n(t          dt          |           d| j         d           t%          j                    }| j        dk    r|                     |          }n|                     |          }t%          j                    |z
  }t          d|ddt          |          |z  dd           t          |          }| j        dk    r|dz  }|dz  }d| j        j                                        v rd}d }| j        j         d!}n{d"| j        j                                        v rd#}d$}| j        j         d%}nGd&| j        j                                        v rd'}d(}| j        j         d)}nd#}d$}| j        j         d*}|d+z  |z  |d+z  |z  z   }| d,}n?|dk    r	|dz  d-z  }nd-|dz
  dz  d$z  z   }d.}|| j        z   d
z
  | j        z   d/| j         d0}t          d1           t          d2|            t          d3|            t          d4|d5d6           t          d7| d0           |D ]b\  }}|                     |||          }|rD|                    |           t          d8|d9d:|d;d<|                    d=d>          d?           cn|D ]}t          j	        
                    ||          }	|                    dd                              d          }
t          |
d                             dd                    }t          |
d
                             dd                    }|                     |	||          }|rD|                    |           t          d8|d9d:|d;d<|                    d=d>          d?           |rZt5          j        |          }g d@}|D ]}||j        vrd||<   ||         }|                    |dAdBC           t          |          S dS )DzDExtract data from all cell images in directory with batch processingc                     g | ]E}|                     d           r.d|                                v-d|                                vC|FS )r   	annotatedboxes)endswithr   )rk   fs     rA   rn   z9CellsToCSVExtractor.extract_all_cells.<locals>.<listcomp>  s`     
 
 
zz&!!
&1&B&BwVWV]V]V_V_G_G_ G_G_G_rT   r   r   z
  Processing z% cells with Gemini (full cell OCR)...z
  Batch processing z+ cells (full cell OCR for 100% accuracy)...z
  Loading z cell images...rd   r   r   rz   r   r  r  Nz	  Loaded z cell imagesz  Processing z cells with Gemini...z cells in batches of z...z  OCR completed in rp   zs (z cells/sec)i     z
flash-liteg?g?z): $0.10/1M input + $0.40/1M output tokensflashg333333?g333333?z): $0.15/1M input + $0.60/1M output tokensprog      ?g      @z): $1.25/1M input + $5.00/1M output tokensz5: $0.15/1M input + $0.60/1M output tokens (estimated)i@B z individual requestsg      ?zFGoogle Cloud Vision: $1.50/1000 images for first 1000, then $0.60/1000z batches (batch size: )z
  API Usage Summary:z    Total images processed: z    API calls made: z    Estimated cost: $z.4fz USDz    (z  Page 3dz Cell 2dr  r   zN/A30s)
r  r  r   r   r   r#  r$  r   r)  r   Fz	utf-8-sig)indexencoding)sortedr&   listdirr   r#   r$   r)   rX   r   r0   r1   r/  r+  intr4   r5   rv   r    rr   r  r  r   r   r=  r*  rQ  pd	DataFramecolumnsto_csv)r;   	cells_dir
output_csv
cell_files
all_votersr  cell_metadatar   	cell_filer   r:  r   r   r  
start_timecell_ocr_resultselapsedrw   input_tokensoutput_tokens
input_costoutput_costpricing_infocostapi_calls_infor1  dfcolumn_orderr  s                                rA   extract_all_cellsz%CellsToCSVExtractor.extract_all_cells  s&     
 
z),,
 
 
  

 
 Ox''D,>'DOW_D_D_dhduD_(**^J^^^____jc*oojjjkkk KM?s:???@@@"+J":": ? ?YGLLI>>	 "))&"55;;C@@uQx//;;<<uQx//;;<< !Z	22
)&&
Hh	'RSSS!(((H)=>>><c+..<<<=== (**Mc+&6&6MMMNNNNac+&6&6aaT_aaabbbJ(**#'#C#CK#P#P  #'#<#<[#I#I ikkJ.GaaaaK8H8H8Paaaabbb [))J(**  *D0 *S 0  4#4#9#?#?#A#AAA!%J"&K&*&7&<#g#g#gLL 1 6 < < > >>>!%J"&K&*&7&<#g#g#gLLd/4::<<<<!%J"&K&*&7&<#g#g#gLL "&J"&K&*&7&<#s#s#sL$y0J>-R[B[_jAjj$.!D!D!D %%&-5DDJ$5#="EEDg%/$/%AA%E$/$Y  "D  "Dqu  rA  "D  "D  "D+,,,===>>>999:::8$8888999),)))*** '4 f f"(001A8XVV f%%e,,,dHdddxdddeiiPVX]F^F^dddeeef ( f f	GLLI>>	 "))&"55;;C@@uQx//;;<<uQx//;;<<)))XxHH f%%e,,,dHdddxdddeiiPVX]F^F^dddeee  	j))Bc c cL $ ! !bj(( BsGL!B IIjIDDDr77NqrT   )r   Nr   r	   Nr
   )rC   )rd   )F)__name__
__module____qualname__rB   rS   rc   r{   r   r   r   r   r   r   r   r  r  r=  rI  rQ  rz  ri   rT   rA   r   r      s_       GK`rJ J J JX     $"4 "4 "4 "4HB B BH3 3 3j6 6 6 6p% % %  L@ L@ L@\; ; ; ;zA A AFD D DLV1 V1 V1pF1 F1 F1PL1 L1 L1\M M M M MrT   r   )__doc__r4   numpyr  PILr   r   pandasrd  r   r&   rr   collectionsr   r   ri   rT   rA   <module>r     s     


                   				 				  # # # # # #_ _ _ _ _ _ _ _ _ _rT   