
    ~Li4X                         d Z ddlZddlZddlmZ ddlmZmZmZm	Z	 ddl
Z
ddlmZ ddlZddlmZ ddlZddlZddlZ G d d          Zd	 Zed
k    r e             dS dS )zt
Perfect Bengali Voter Extractor - Handles all edge cases
Cleans OCR errors, validates data, handles missing fields
    N)Path)ListDictOptionalTuple)convert_from_path)Imagec            
          e Zd Zd"dedefdZdej        dej        fdZdej        dee	e
e
f                  fdZdej        d	e
d
e
dej        fdZdej        defdZdedefdZdedefdZdedee         fdZdedee         fdZdedee         fdZdedeeeef                  fdZde
deeeef                  fdZd#de
dee
         deeeef                  fdZdeeeef                  d efd!ZdS )$PerfectVoterExtractorFpdf_pathdebugc                 h    t          |          | _        || _        ddddddddddddddd| _        d S )N u   জন্মu   কৃষকu	   মোঃ
   নাম:)BeatzKS)deoz{deoTRSWONNBTOROFABDEAu   জন্মুu   কৃষ্কu	   মৌঃu
   নাঈ:)r   r   r   	ocr_fixes)selfr   r   s      K/var/www/development/aibuddy-work/election-extract/extract_voter_perfect.py__init__zPerfectVoterExtractor.__init__   sT    X
 --$&
 
    imagereturnc                 V   t          j        t          j        |          t           j                  }t          j        |t           j                  }t          j        |dddd          }t          j        |ddd          }t          j        |dt           j	        t           j
        dd	          }t          j        d
t          j                  }t          j        |t           j        |          }t          j        dd          }	|	                    |          }
t#          j        |
          S )z%Enhanced preprocessing for better OCRN         	   K            )r(   r(   g      @)   r)   )	clipLimittileGridSize)cv2cvtColornparrayCOLOR_RGB2BGRCOLOR_BGR2GRAYfastNlMeansDenoisingbilateralFilteradaptiveThresholdADAPTIVE_THRESH_GAUSSIAN_CTHRESH_BINARYonesuint8morphologyExMORPH_CLOSEcreateCLAHEapplyr	   	fromarray)r   r   img_cvgraydenoised	bilateralthreshkernelmorphclaheenhanceds              r   preprocess_imagez&PerfectVoterExtractor.preprocess_image,   s    bhuoos/@AA|FC$677 +D$ArBB '!R<<	 &sC:r1
 
 )) &AA #EBBB;;u%%x(((r   c                    t          j        t          j        |          t           j                  }t          j        |t           j                  }|j        \  }}t          j        |dk     d          }t          |dz  d          }t          j	        |          |z  }t          j
        ||d          }	t          j        |	          dz  }
|	|
k     }g }d	}d}t          |          D ]7\  }}|r|s|}d
}|s'|r%||z
  |dz  k    r|                    ||f           d	}8t          |          dk    rz|d         d         }|d         d         }t          |          dk    r|d         d         n|dz  dz  }t          |          dk    r|d         d         n|dz  dz  }d|f||f||fg}n|dz  }d|f|d|z  fd|z  |fg}| j        rt!          d|            |S )zDetect column boundaries   r   )axisd      same)modeg333333?FT   r(         z  Detected columns: )r,   r-   r.   r/   r0   r1   shapesummaxr7   convolvemean	enumerateappendlenr   print)r   r   r>   r?   heightwidthvertical_projectionkernel_sizerC   smoothed	threshold	is_valleyvalleys	in_valleystartivalleycol1_end
col2_startcol2_end
col3_startcolumns	col_widths                          r   detect_columnsz$PerfectVoterExtractor.detect_columnsG   s1   bhuoos/@AA|FC$677
 fTCZa888 %3,**%%3;2FHHH GH%%+	y(		"9-- 	" 	"IAv "i " 		 "	 "u9u{**NNE1:...!	 w<<1qz!}H AJ(+Gq(8(8wqz!}}eai1nH*-g,,*:*:A	QJHX&U#GG 
IIA	M*Y&G : 	4222333r   x_startx_endc                 n    t          j        |          }|dd||f         }t          j        |          S )zExtract column from imageN)r.   r/   r	   r=   )r   r   rn   ro   	img_arraycolumns         r   extract_column_imagez*PerfectVoterExtractor.extract_column_image}   s6    HUOO	111gem+,v&&&r   column_imagec                 `    |                      |          }d}t          j        ||          }|S )zOCR with better configz--oem 3 --psm 6 -l ben+eng)config)rG   pytesseractimage_to_string)r   rt   	processedcustom_configtexts        r   
ocr_columnz PerfectVoterExtractor.ocr_column   s4    )),77	5*9]KKKr   r{   c                 j    |s|S t                               dd          }|                    |          S )z#Convert Bengali numerals to Englishu   ০১২৩৪৫৬৭৮৯
0123456789)str	maketrans	translate)r   r{   transs      r   bengali_to_english_numberz/PerfectVoterExtractor.bengali_to_english_number   s5     	K>MM~~e$$$r   c                    |s|S |}| j                                         D ]\  }}|                    ||          }t          j        dd|          }t          j        dd|          }t          j        dd|          }t          j        dd|          }t          j        dd|          }t          j        dd|          }|                    d	d          }|                                S )
zClean common OCR errorsz[!@#$%^&*|\\<>{}[\](){}"\'`~]r   z\b[A-Za-z]{1,3}\b\s+ z\s+\d{1,2}\s+z[,.\-:;_]+$z^[,.\-:;_]+_)r   itemsreplaceresubstrip)r   r{   cleanederrorfixs        r   clean_ocr_errorsz&PerfectVoterExtractor.clean_ocr_errors   s     	K...00 	2 	2JE3ooeS11GG &92wGG &-r7;; &g.. &)388 &W55&W55 //#s++}}r   vidc                     |sdS |                      |          }t          j        dd|          }dt          |          cxk    rdk    rn n|S dS )zValidate and clean voter IDNz[^\d]r   
      )r   r   r   rY   )r   r   	vid_cleans      r   validate_voter_idz'PerfectVoterExtractor.validate_voter_id   sn     	4 ,,S11 F8R--	 Y%%%%2%%%%%tr   namec                    |rt          |          dk     rdS t          j        dd|          }t          j        dd|          }t          j        dd|          }t          j        dd|          }t          j        dd	|          }t          j        d
d	|          }t          j        dd|          }|                     |          }t          j        dd	|                                          }t          j        dd|                                          }t          j        d|          sdS t          |          dk     st          |          dk    rdS |S )zValidate and clean namerQ   Nz^\d{4}\.?\s*r   u   [০-৯]{4}\.?\s*z\b[A-Z]{2,}\bz\b[A-Za-z]{4,}\bz	\s*\d+\s*r   u   \s*[০-৯]+\s*u"   [°¥±÷×§¶†‡•◦▪▫]r   z\.+$[\u0980-\u09FF]rK   )rY   r   r   r   r   search)r   r   s     r   validate_namez#PerfectVoterExtractor.validate_name   sG    	s4yy1}}4 vor400v+R66 v&D11v)2t44 vlC..v)355 v;RFF $$T**vfc4((..00 vgr4((..00 y+T22 	4t99q==CIIOO4r   date_strc                 T   |sdS |                      |          }t          j        d|          }|rw|                                \  }}}t	          |          t	          |          t	          |          }}}d|cxk    rdk    r%n n"d|cxk    rdk    rn nd|cxk    rdk    rn n|S dS )zValidate date formatNz^(\d{2})/(\d{2})/(\d{4})$rP      r!   il  i  )r   r   matchgroupsint)r   r   r   daymonthyears         r   validate_datez#PerfectVoterExtractor.validate_date   s     	4 11(;; 5x@@ 	 $||~~C"3xxUSYYC C~~~~2~~~~~!u"2"2"2"2"2"2"2"2"2tt7K7K7K7Kt7K7K7K7K7Ktr   c                 	   i }	 t          j        d|          }|r/|                     |                    d                    }|r||d<   t          j        d|t           j                  }|r/|                     |                    d                    }|r||d<   |                     |          }t          j        d|t           j                  }|s t          j        d|t           j                  }|r@|                     |                    d                    }|                    d          |d	<   t          j        d
|t           j                  }	|	rt|	                    d          	                                }
t          j
        dd|
          }
t          j        d|
          d         }
|                     |
          }
|
r|
|d<   t          j        d|t           j                  }|rt|                    d          	                                }t          j
        dd|          }t          j        d|          d         }|                     |          }|r||d<   t          j        d|t           j                  }|r|                    d          	                                }t          j
        dd|          }t          j        d|          d         }t          j        d|          d         }|                     |          }t          j
        dd|          	                                }t          j
        dd|          }t          j        d|          r"dt          |          cxk    rdk    rn n||d<   t          j        d|          }|r/|                     |                    d                    }|r||d<   t          j        d |t           j                  }|r|                    d          	                                }t          j        d!|          d         }|                     |          }t          j
        dd|          	                                }t          j        d|          r"d"t          |          cxk    rd#k    rn n||d$<   d|v o|d         }d|v od	|v }|s|r|S n0# t          $ r#}| j        rt!          d%|            Y d&}~nd&}~ww xY wd&S )'z"Parse single voter with validationuX   (?:ভোটার\s*নং|র\s*নং|J\s*(?:12|নং))[:\s]*([০-৯0-9\s]{10,20})rP   voter_idus   (?:নাম|লাম)[:\s]*([^\n]+?)(?=\s*ভোটার|\s*র\s*নং|\s*J\s*(?:12|নং)|\s*পিতা|$)r   u5   ([০-৯0-9]{1,4})\s*[.,]?\s*(?:নাম|লাম)u!   ^[^\d]*([০-৯0-9]{1,4})\s*[.,]   	serial_nouF   পিতা[:\s]*([^\n]+?)(?=\s*মাতা|\s*ঠিকানা|$)uB   (পিতা|মাতা|ঠিকানা|পেশা)[:\s]*$r   u6   \s+(মাতা|ঠিকানা|পেশা)[:\s]r   father_nameuF   মাতা[:\s]*([^\n]+?)(?=\s*ঠিকানা|\s*পেশা|$)u6   \s+(ঠিকানা|পেশা|জন্ম)[:\s]mother_nameuF   ঠিকানা[:\s]*([^\n]+?)(?=\s*পেশা|\s*জন্ম|$)ui   (পেশা|জন্ম|ঠিকানা|ভোটার|নাম|পিতা|মাতা)[:\s]*$uJ   \s+(পেশা|জন্ম|ভোটার|নাম|পিতা)[:\s]   [০-৯0-9]{4}\.r   r   u"   (তারিখ|fear|fost|WON).*$r   rL      addressuF   তারিখ[:\s]*([০-৯0-9]{2}/[০-৯0-9]{2}/[০-৯0-9]{4})date_of_birthuC   পেশা[:\s]*([^,\n]+?)(?=,|জন্ম|তারিখ|\s*$)u#   (জন্ম|তারিখ)[:\s]r(   2   
professionz  Parse error: N)r   r   r   group	MULTILINEr   r   r   zfillr   r   splitrY   r   	Exceptionr   rZ   )r   r{   voter	vid_matchr   
name_matchr   serial_matchserialfather_matchfathermother_matchmother
addr_matchaddr	dob_matchdob
prof_matchprofhas_namehas_id_seriales                         r   parse_voterz!PerfectVoterExtractor.parse_voter   s   h	-	"}  @D  E  EI ,,,Y__Q-?-?@@ ,(+E*%   $Z  \`  bd  bn  o  oJ )))**:*:1*=*=>> )$(E&M ((..D 9%]_cegeqrrL c!y)MtUWUabb 5778J8J18M8MNN%+\\!__k" 9%nptvx  wC  D  DL 2%++A..4466 egikqrr"[]cddefg++F33 2+1E-( 9%nptvx  wC  D  DL 2%++A..4466 egikqrr"[]cddefg++F33 2+1E-( #lnrtv  uA  B  BJ ,!''**0022v  K  MO  QU  V  Vx mosttuvw x 4d;;A>,,T22vfc4006688 vCRNN 9/66 ,1D		;P;P;P;PS;P;P;P;P;P'+E)$ 	"kmqrrI 1((););<< 1-0E/* #ikoqsq}~~J /!''**0022 x FMMaP,,T22vfc4006688 9/66 /1D		;O;O;O;OR;O;O;O;O;O*.E,' 85=H&%/HK54HM =   	- 	- 	-z -+++,,,	- ts   R2R8 8
S%S  S%page_numc                    t          d| d           t          | j        d||          }|st          d           g S |d         }|                     |          }g }t	          |          D ]#\  }\  }}| j        rt          d|dz    d	           |                     |||          }	|                     |	          }
|
rt          |
          d
k     r| j        rt          d           ~t          j
        d|
          }t          |          dk     rt          j
        d|
          }t          |          dk     rt          j
        d|
          }t          |          dk     rt          j
        d|
          }d}|D ]}|                                }t          |          dk     r+t          j        d|          }t          j        d|          }d|v pd|v }|s|sbd|v }t          j        d|          }d|v pd|v }d|v }d|v ot          j        d|          }|rN|rL|sJ|sH| j        r@t          j        d|          }|r|                    d          nd}t          d|            |                     |          }|r|                    d           sL|                    d!d"          t!          fd#d$D                       r| j        rt          d%            h||d&<   |dz   |d'<   |r|                    d!          rd(|d)<   |                    |           |dz  }| j        rF|rd*nd"}t          d+|                    d!d           d,|                    d d           d-|            | j        rt          d.| d/           %t          d0t          |           d1t          |           d2           |S )3zExtract voters from pagezProcessing page z...i,  )dpi
first_page	last_pageu     ❌ No image extractedr   z

  Column rP   :r   z    No sufficient textu/   (?=[০-৯0-9]{4}\.\s*(?:নাম|লাম))r(   u=   (?=[০-৯0-9১-৯>এ]\s*[.,]?\s*(?:নাম|লাম))u   (?=(?:নাম|লাম):)u;   (?=ভোটার\s*নং|র\s*নং|J\s*(?:12|নং))   u   [০-৯0-9]{1,4}\.u;   (?:ভোটার\s*নং|র\s*নং|J\s*(?:12|নং))u	   নামu	   লামu+   মাইগ্রেট হয়েছেr   u   মাইগ্রেটu   হয়েছেr   u   ভোটারu   [০-৯0-9]{10,}u   ([০-৯0-9]{4})\.zN/Au        ⊗ Skipped migrated entry: r   r   r   c              3       K   | ]}|v V  	d S )N ).0wordr   s     r   	<genexpr>zAPerfectVoterExtractor.extract_voters_from_page.<locals>.<genexpr>  s?        I  Itt|  I  I  I  I  I  Ir   )u   কান্দিu   বাজারu   গ্রামu   এলাকাu       ⊗ Skipped place name: page_numberrr   Yesmigratedz [MIGRATED]u       ✓ z (ID: )z    Extracted: z votersu     ✓ Extracted z voters from z columns)rZ   r   r   rm   rW   r   rs   r|   rY   r   r   r   r   r   r   getanyrX   )r   r   imagesr   rk   voterscol_idxrn   ro   	col_imager{   parts
col_voterspart	part_text
has_serialhas_voter_patternhas_name_fieldis_migratedhas_migratedr   has_voter_idr   r   r   migrated_markr   s                             @r   extract_voters_from_pagez.PerfectVoterExtractor.extract_voters_from_pagei  s   ..../// #Ms8
 
 

  	,---Iq	 %%e,,)27);); ^	= ^	=%G%guz 42GaK222333 11%%HHI ??9--D 3t99r>>: 53444 HOQUVVE 5zzA~~ !acghh 5zzA~~!@$GG 5zzA~~!_aeffJ 9y 9y JJLL	 y>>B&&  Y'=yII
$&I.lnw$x$x!!,	!9!U[I=U ' *;  LyX  Y';YGG
9YFkJ^bkJk'940I=l")L`bkBlBl  ,  L z K')y1G'S'S:F!Q!3!3A!6!6!6EIIIJJJ((33 y 99Z00 %$yy44  I  I  I  I  9H  I  I  I  I  I %#z M %&KT&K&K L L L$+3E-(&-kE(O # 2uyy'8'8 2,1j)MM%(((!OJz y9D(L"w65)A)AwwS]_dIeIewwhuwwxxxz =;
;;;<<<QVQQ3w<<QQQRRRr   rP   N
start_pageend_pagec                    t          j        | j                  }t          |          }|                                 ||}nt          ||          }t          dd            t          d           t          d| j        j                    t          d| d|            t          d d           g }t          ||dz             D ]:}| 	                    |          }|
                    |           t                       ;t          d            t          d	t          |                      t          d d           |S )
zExtract from all pagesN
zF======================================================================z  Perfect Voter Extractionz  File: z	  Pages: z to rP   u     ✓ Total voters extracted: )fitzopenr   rY   closeminrZ   r   ranger   extend)r   r   r   doctotal_pages
all_votersr   r   s           r   extract_all_votersz(PerfectVoterExtractor.extract_all_voters  sT   i&&#hh		"HH8[11Hm6mm+,,,-+--...4*44(44555mmm
j(Q,77 	 	H228<<Ff%%%GGGGk@s:@@AAAmmmr   r   output_pathc                    |st          d           dS t          j        |          g d}fd|D             }|         fddD             }|r                    |                              |dd	           t          d
|            t          dt                                t          dj        v r/dd                                                                          nd           t          dj        v r/dd                                                                          nd           t          dj        v r/dd                                                                          nd           dS )zSave to CSV with validationu   ⚠️  No voters to save!N)r   rr   r   r   r   r   r   r   r   r   r   c                 &    g | ]}|j         v |S r   rk   )r   coldfs     r   
<listcomp>z5PerfectVoterExtractor.save_to_csv.<locals>.<listcomp>
  s%    @@@CcRZ.?.?C.?.?.?r   c                 &    g | ]}|j         v |S r   r   )r   cr   s     r   r   z5PerfectVoterExtractor.save_to_csv.<locals>.<listcomp>  s     ZZZ1!rz//Q///r   )r   rr   r   Fz	utf-8-sig)indexencodingu   ✓ Saved to: u   ✓ Total records: r   u   ✓ Records with voter ID: r   r   u   ✓ Records with name: r   u   ✓ Records with DOB: )	rZ   pd	DataFramesort_valuesto_csvrY   rk   notnarS   )r   r   r   rk   existing	sort_colsr   s         @r   save_to_csvz!PerfectVoterExtractor.save_to_csv  s    	.///F\&!!> > > A@@@7@@@\ [ZZZ FZZZ	 	+	**B 			+U[	AAA 	,{,,----CGG--...j\^\fNfNfJBzN,@,@,B,B,F,F,H,HJJJlnooofPRPZFZFZB6
(8(8(:(:(>(>(@(@BBB`bcccoacakNkNkJr/':'@'@'B'B'F'F'H'HJJJqstttttr   )F)rP   N)__name__
__module____qualname__r   boolr   r	   rG   r   r   r   rm   rs   r|   r   r   r   r   r   r   r   r   r   r   r  r   r   r   r   r      s       
 
 
T 
 
 
 
,)ek )ek ) ) ) )64EK 4DsCx4I 4 4 4 4l'%+ ' 'C 'TYT_ ' ' ' 'u{ s    %c %c % % % %S S    <S Xc]    ""# "(3- " " " "Hc hsm    (n nc3h(@ n n n n`u ud38n9M u u u un S  Y]^bcfhkck^lYm    :u$tCH~"6 uS u u u u u ur   r   c                     dd l } |                     d| j        d          }|                    dd           |                    dd	d
d           |                    ddt          dd           |                    ddt          d           |                    dddd           |                                }t          |j                                                  s+t          d|j                    t          j        d           	 t          |j        |j                  }|                    |j        |j                  }|                    ||j                   d S # t&          $ r' t          d           t          j        d           Y d S t(          $ rP}t          d|            |j        rdd l}|                                 t          j        d           Y d }~d S d }~ww xY w)Nr   z%Perfect Bengali Voter Data Extractiona  
Examples:
  # Extract all pages
  python extract_voter_perfect.py voter_list.pdf

  # Specific pages with debug
  python extract_voter_perfect.py voter_list.pdf -s 3 -e 10 -d

  # Custom output
  python extract_voter_perfect.py voter_list.pdf -o clean_data.csv
        )descriptionformatter_classepilogpdf_filezPDF file path)helpz-oz--outputzvoter_data_perfect.csvzOutput CSV file)defaultr  z-sz--start-pagerP   z
Start page)typer  r  z-ez
--end-pagezEnd page)r  r  z-dz--debug
store_truez
Debug mode)actionr  u   ❌ Error: File not found: u   

⚠️  Interrupted by useru   
❌ Error: )argparseArgumentParserRawDescriptionHelpFormatteradd_argumentr   
parse_argsr   r  existsrZ   sysexitr   r   r   r   r   r  outputKeyboardInterruptr   	traceback	print_exc)r  parserargs	extractorr   r   r%  s          r   mainr*    s:   OOO$$; <
 %  F  
999
j2J-  / / /
n3(  * * *
l&  ( ( (
i(  * * * D%%'' ;DM;;<<<)$-DD	--dot}MMfdk22222   /000   !a!!""": 	"!!!s    AE% %-G.	G.AG))G.__main__)__doc__r   r!  pathlibr   typingr   r   r   r   r   	pdf2imager   rw   PILr	   r,   numpyr.   pandasr  r   r*  r  r   r   r   <module>r3     s    
 
			 



       . . . . . . . . . . . .  ' ' ' ' ' '           



        Eu Eu Eu Eu Eu Eu Eu EuP/ / /d zDFFFFF r   