
    Bi>                        d dl Z d dlZd dlZ	 ddlmZ n# e$ r d dlZY nw xY w	 ddlmZ n# e$ r d dlZY nw xY wej        Zej	        Z	dZ
dZdZdZ	 ej        Zn# e$ r eez  ez  ZY nw xY wej        Zej        edf         Zej        e         Zej        e         Zej        e         Zej        e         Zej        ej                  Z!	 	 	 	 	 dDd
ej"        dededej#        de$de%fdZ&	 	 	 	 	 	 dEd
ej"        dededej#        de$de%fdZ'	 	 	 	 dFd
ej"        dededej#        def
dZ(	 dGd
ej"        dedej#        defdZ)	 	 dHd
ej"        de
de
dedej#        f
dZ*	 	 	 	 	 dId
ej"        dededede$dedej#        fdZ+	 dJdddd	ddd!d
ej"        d"edededej#        de$fd#Z,dGdefd$Z-d%ed&edefd'Z.d
ej"        d(edefd)Z/de%fd*Z0de%fd+Z1d,ede2fd-Z3d,ede2fd.Z4d/ej5        d%ede2fd0Z6d1 Z7	 d2 Z8d3 Z9defd4Z:defd5Z;d6edefd7Z<d8e2d9ed:e2dej=        fd;Z>d8e2d9edej=        fd<Z?dGd=ed>e%dej=        fd?Z@dGd8e2d9ed@e%dej=        fdAZAd8e2d9edBedej=        fdCZBdS )K    N   )pymupdf)mupdf
point_like	rect_likematrix_like	quad_likeFpageclipflagstextpagesortreturnc                 (   t          j        |            |t           j        }|}||                     ||          }n#t	          |d          | k    rt          d          |                                }|~|r|                    d            |S )a_  Return the text blocks on a page.

    Notes:
        Lines in a block are concatenated with line breaks.
    Args:
        flags: (int) control the amount of data parsed into the textpage.
    Returns:
        A list of the blocks. Each item contains the containing rectangle
        coordinates, text lines, running block number and block type.
    Nr   r   parentnot a textpage of this pagec                 "    | d         | d         fS N   r    )bs    e/var/www/development/aibuddy-work/election-extract/venv/lib/python3.11/site-packages/pymupdf/utils.py<lambda>z!get_text_blocks.<locals>.<lambda>R   s    1Q41,     key)r   CheckParentTEXTFLAGS_BLOCKSget_textpagegetattr
ValueErrorextractBLOCKSr   )r
   r   r   r   r   tpblockss          r   get_text_blocksr&   4   s    " }(	B	zD66	X		$	&	&6777F 0..///Mr   r   c                 p   fd}t          j        |            |t           j        }|}||                     |          }n#t	          |d          | k    rt          d          |                    |          }	|$"t          j                  fd|	D             }	|~|	r|r ||	          }	|	S )a  Return the text words as a list with the bbox for each word.

    Args:
        page: pymupdf.Page
        clip: (rect-like) area on page to consider
        flags: (int) control the amount of data parsed into the textpage.
        textpage: (pymupdf.TextPage) either passed-in or None.
        sort: (bool) sort the words in reading sequence.
        delimiters: (str,list) characters to use as word delimiters.
        tolerance: (float) consider words to be part of the same line if
            top or bottom coordinate are not larger than this. Relevant
            only if sort=True.

    Returns:
        Word tuples (x0, y0, x1, y1, "word", bno, lno, wno).
    c                 R   |                      d            g }| d         g}t          j        | d         dd                   }| dd         D ]}t          j        |dd                   }t          |j        |j        z
            k    s t          |j        |j        z
            k    r|                    |           ||z  }y|                     d            |                    |           |g}|}|                     d            |                    |           |S )	z1Sort words line-wise, forgiving small deviations.c                 "    | d         | d         fS r   r   ws    r   r   z4get_text_words.<locals>.sort_words.<locals>.<lambda>r   s    !A$! r   r   r   N   r   c                     | d         S Nr   r   r*   s    r   r   z4get_text_words.<locals>.sort_words.<locals>.<lambda>   s
    ! r   c                     | d         S r.   r   r*   s    r   r   z4get_text_words.<locals>.sort_words.<locals>.<lambda>   s
    ! r   )r   r   Rectabsy0y1appendextend)wordsnwordslinelrectr+   wrect	tolerances         r   
sort_wordsz"get_text_words.<locals>.sort_wordsp   s)   

--
...azU1Xbqb\**qrr 	 	AL2A2''EEHux'((I55ux%(*++y88A		nn	---d###s		nn	%%%dr   Nr   r   r   c                     g | ]L}t          |d d         z            dt          t          j        |d d                             z  k    J|MS )Nr,   g      ?)r1   r   r0   ).0r+   r   s     r   
<listcomp>z"get_text_words.<locals>.<listcomp>   s]     
 
 
D1RaR5L 1 1S3w|AbqbE?R?R;S;S5S S SA S S Sr   )r   r   TEXTFLAGS_WORDSr    r!   r"   extractWORDSr0   )
r
   r   r   r   r   
delimitersr;   r<   r$   r6   s
    `    `   r   get_text_wordsrC   V   s   4    2 }'	B	zD66	X		$	&	&6777OOJ''E  0|D!!
 
 
 

 
 
  " "
5!!Lr   c           	         d }d t          | |||d|          D             }|sdS t          j                    }|D ]
\  }}	||z  }g }
|d         g}|d         d         }|dd         D ]\  }}	|d	         \  }}t          |j        |j        z
            |k    s t          |j        |j        z
            |k    r|                    ||	f           ||z  }m |||          }|
                    ||f           ||	fg}|} |||          }|
                    ||f           |
                    d
            |
d         d         }	|
d         d         j        }|
dd         D ]V\  }}t          t          t          |j        |z
  |j        z                      d          }d|dz   z  }|	||z   z  }	|j        }W|	S )a  Extract plain text avoiding unacceptable line breaks.

    Text contained in clip will be sorted in reading sequence. Some effort
    is also spent to simulate layout vertically and horizontally.

    Args:
        page: pymupdf.Page
        clip: (rect-like) only consider text inside
        flags: (int) text extraction flags
        textpage: pymupdf.TextPage
        tolerance: (float) consider words to be on the same line if their top
            or bottom coordinates do not differ more than this.

    Notes:
        If a TextPage is provided, all text is checked for being inside clip
        with at least 50% of its bbox.
        This allows to use some "global" TextPage in conjunction with sub-
        selecting words in parts of the defined TextPage rectangle.

    Returns:
        A text string in reading sequence. Left indentation of each line,
        inter-line and inter-word distances strive to reflect the layout.
    c                 n   |                     d            d}| j        }t          j                    }|D ]~\  }}||z  }t	          t          t          |j        |z
  |j        z  t          |          z                      || j        k    s|j        |k    rdnd          }|d|z  |z   z  }|j	        }|S )a  Create the string of one text line.

        We are trying to simulate some horizontal layout here, too.

        Args:
            clip: (pymupdf.Rect) the area from which all text is being read.
            line: (list) word tuples (rect, text) contained in the line
        Returns:
            Text in this line. Generated from words in 'line'. Distance from
            predecessor is translated to multiple spaces, thus simulating
            text indentations and large horizontal distances.
        c                     | d         j         S r.   )x0r*   s    r   r   z4get_sorted_text.<locals>.line_text.<locals>.<lambda>   s    ! r   r    r   r    )
r   rG   r   
EMPTY_RECTmaxintroundwidthlenx1)r   r8   ltextrP   r9   rtdists           r   	line_textz"get_sorted_text.<locals>.line_text   s     			''	(((W"$$ 		 		DAqQJEE14"9/#a&&899::DGmmqtrzz D
 S4Z!^#EBBr   c                 V    g | ]&}t          j        |d d                   |d         f'S )Nr,   )r   r0   )r>   r+   s     r   r?   z#get_sorted_text.<locals>.<listcomp>   sC     
 
 
 
ae		ad#
 
 
r   T)r   r   r   r   r;   rH   r   r   Nc                     | d         j         S r.   )r3   )ls    r   r   z!get_sorted_text.<locals>.<lambda>  s    adg r   r      
)rC   r   rJ   r1   r2   r3   r4   r   minrL   rM   height)r
   r   r   r   r;   rU   r6   totalboxwrtextlinesr8   r9   w0r_rQ   r3   distancebreakss                      r   get_sorted_textrf      s9   >  <
 

 
 

 
 
E  r!##H  DBE!H:D!HQKE !""I  DbQ ux"%  I--UX5E1F1F)1S1SKKT
###RKEE Ih--ELL%(((J<DEE Ih%%E	LL%    
JJ&&J'''8A;D	q!Babb	  us5%(R-5<!?@@AA1EEA&X Kr   rectc                     |}||                                  }n#t          |d          | k    rt          d          |                    |          }|~|S )Nr   r   )r    r!   r"   extractTextbox)r
   rg   r   r$   rcs        r   get_textboxrk     sf    
 
B	z  	X		$	&	&6777			4	 	 BIr   p1p2c                     t          j        |            |}|"|                     |t           j                  }n#t	          |d          | k    rt          d          |                    ||          }|~|S )Nr   r   r   )r   r   r    TEXT_DEHYPHENATEr!   r"   extractSelection)r
   rl   rm   r   r   r$   rj   s          r   get_text_selectionrq   *  s     	B	zD0HII	X		$	&	&6777			R	$	$BIr   engH   languagedpifulltessdatac                 "   t          j        |            t          j                  fd}|r || |||          S |                     |          }|                     dt           j                  d         D ]}|d         dk    rt          j        |d                   }	|	j        dk    s|	j        dk    rA	 t          j	        |d	                   }
|
j
        |
j        z
  dk    rt          j	        t           j        |
          }
|
j        rt          j	        |
d
          }
t          j        d|
                    |                    }|                    d
          }d}
|j        }t          j        d|j        z  d|j        z            }||d         z  }|                    |d
|           |                                 P# t(          t*          j        f$ r* 	 d}t          j        d            || |||          cY c S w xY w|S )as  Create a Textpage from combined results of normal and OCR text parsing.

    Args:
        flags: (int) control content becoming part of the result.
        language: (str) specify expected language(s). Default is "eng" (English).
        dpi: (int) resolution in dpi, default 72.
        full: (bool) whether to OCR the full page image, or only its images (default)
    c                    |dz  }t          j        ||          }|                     |          }t          j        d|                    d|                    }|                    d          }| j        j        |j        j        z  }	t          j        |	|	          | j        z  }
|	                    ||
          }|
                                 d }t          j        |           |_        |S )Nrs   )matrixpdfF)compressrt   rw   r   r   rz   )r   Matrix
get_pixmapDocumentpdfocr_tobytes	load_pagerg   rN   derotation_matrixr    closeweakrefproxyr   )r
   ru   rt   r   zoommatpixocr_pdfocr_pageunzoomctmtpagerw   s               r   full_ocrz"get_textpage_ocr.<locals>.full_ocrP  s    RxnT4((ooSo))""""%% #    $$Q''8=#66nVV,,t/EE%%E#%>>}T**r   )r   dictr%   typer   bboxr   imager   r{   )rt   rw   N	transformr}   zFalling back to full page OCR)r   r   get_tessdatar    get_textTEXT_PRESERVE_IMAGESr0   rN   r]   PixmapnalphacsRGBr   r   r   rg   r~   extend_textpager   RuntimeErrorr   FzErrorBaseg_exceptions_verboseexception_infomessage)r
   r   rt   ru   rv   rw   r   r   blockr   r   imgdocimgpageimgrectshrinkr   s        `          r   get_textpage_ocrr   =  sD     #H--H    ,  4xc8U333
 E**EvW-IJJ8T 8 8=A|E&M**:??dkQ..	8.w00Cusy A%%nW]C88y -nS!,,%&&8&LL F &&q))GClG^A$5q7>7IJJF5--C##E3#???LLNNNNe/0 	8 	8 	8) EO;<<<8D#x7777777	8 Ls   >DG;HHr`   )r   r   r   r   rB   r;   optionc                   t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        t           j        d
}|	                                }||v sJ ||vrd}|||         }|dk    rt          | |||||          S |dk    rt          | ||||          S |dk    r|rt          | ||||          S t          j        |            d}	|d	v r| j        }|t          j        |          }d}	n"t!          |           t           j        u r| j        }	|}
|
|                     ||
          }
n#t'          |
d          | k    rt)          d          |dk    r|
                    |	|          }n|dk    r|
                    |	|          }n|dk    r|
                    |	|          }n|dk    r|
                    |	|          }ng|dk    r|
                                }nL|dk    r|
                                }n1|dk    r|
                                }n|
                    |          }|~
|S )a  Extract text from a page or an annotation.

    This is a unifying wrapper for various methods of the pymupdf.TextPage class.

    Args:
        option: (str) text, words, blocks, html, dict, json, rawdict, xhtml or xml.
        clip: (rect-like) restrict output to this area.
        flags: bit switches to e.g. exclude images or decompose ligatures.
        textpage: reuse this pymupdf.TextPage and make no new one. If specified,
            'flags' and 'clip' are ignored.

    Returns:
        the output of methods get_text_words / get_text_blocks or pymupdf.TextPage
        methods extractText, extractHTML, extractDICT, extractJSON, extractRAWDICT,
        extractXHTML or etractXML respectively.
        Default and misspelling choice is "text".
    )
r`   htmljsonrawjsonxmlxhtmlr   rawdictr6   r%   r`   Nr6   )r   r   r   r   rB   r%   )r   r   r   r   )r   r   r   r;   )r   r   r   r   r   r   r   )cbr   r   r   r   r   r   r   )r   )r   TEXTFLAGS_TEXTTEXTFLAGS_HTMLTEXTFLAGS_DICTTEXTFLAGS_RAWDICTTEXTFLAGS_XMLTEXTFLAGS_XHTMLr@   r   lowerrC   r&   rf   r   cropboxr0   r   Pager    r!   r"   extractJSONextractRAWJSONextractDICTextractRAWDICTextractHTML
extractXMLextractXHTMLextractText)r
   r   r   r   r   r   rB   r;   formatsr   r$   rS   s               r   r   r     s   : &&&,$(&,(* G \\^^FWW}!
 
 
 	
 t58$
 
 
 	
 D
 
 
 	
 	B)))||D!!	dw|	#	#\	B	zD66	X		$	&	&6777NNbtN,,	9		$//	6		NNbtN,,	9		$//	6		NN	5MMOO	7		OONNN%%Hr   c                    t          | t          j                  r|                     |          }n=t          | t          j                  r| j        }nJ dt          |           d            |j        dd}	 t          | d          r
| j	        |d<   n.# t          $ r! t          dk    rt          j                     Y nw xY wt          j        dd          }|j        t          j        z  r|j        j        |_        |j        t          j        z  r|j        j        |_        |j        t          j        k    r|j        |d<   n|j        t          j        k    r;|j        |d	<   ||d
<   |j        t          j        z  r|j        j        |d<   nHd|d<   nA|j        t          j        k    rm|j                            dd          |d<   |j        |d	<   |j        dk     r|j        |d
<   n||d
<   |j        t          j        z  r|j        j        |d<   nd|d<   n|j        t          j        k    r|j                            dd          |d<   n|j        t          j        k    rl|j                                         |                                 z  rJ |!                    |j                   d
|v rt          j        |d
                   |d
<   n
|j        |d	<   |S )Nr   zUnexpected type(ln)=.)kindxrefrg   from   urir
   tor   g        \/file)"
isinstancer   OutlinedestinationLinkdestr   r   hasattrrg   	Exceptionr   r   Pointr   LINK_FLAG_L_VALIDltxLINK_FLAG_T_VALIDyLINK_URIr   	LINK_GOTOr
   LINK_FLAG_R_IS_ZOOMrb
LINK_GOTOR	file_specreplaceLINK_LAUNCH
LINK_NAMEDnamedkeysupdate)lndocumentr   nlpnts        r   getLinkDictr     s   "go&& -~~h''	B	%	% -w,,R,,,,,,)Q	'	'B2v 	!BvJ   1$$(>(@(@(@ -1

CzG-- 	zG-- 	yG$$$H5			g'	'	'Y6
4:33 	BvJJBvJJ	g(	(	(^++D#666
Y6
9q==yBtHHBtHzG77 !!WY6

 6

	g)	)	)^++D#666

	g(	(	(JOO%%		1222
		$*2::}RX..BtH Y6
Is   9B (B?>B?r   ddictc                 V   |sdS d }d }d }d }d }t          |          t          t          fv r || d|d          }|S |                    dt          j                  }|t          j        k    rdS |d         t          j        k    rT|                    d	d          }	|                    d
t	          j        dd                    }
|
\  }} || |||	          }|S |d         t          j        k    r% |t	          j	        |d                             }|S |d         t          j
        k    r(t	          j	        |d                   } |||          }|S |d         t          j        k    rM|d         dk     rAt	          j	        |d                   } |t	          j	        |d
                   ||          }|S |d         t          j        k    rZ|d         dk    rNt	          j	        |d                   } ||d         |d
         j        |d
         j        |d	         ||          }|S dS )zrCalculate the PDF action string.

    Notes:
        Supports Link annotations and outline items (bookmarks).
    rH   c                 4    d|  dt          |||f           dS )Nz/A<</S/GoTo/D[z	 0 R/XYZ z]>>	_format_g)ar   cds       r   r   zgetDestStr.<locals>.<lambda>A  s)    "X1"X"Xy!QPQ?S?S"X"X"X r   c           	      @    d|  dt          |||f           d| d| d	S )Nz/A<</S/GoToR/D[z /XYZ z]/F<</F/UF/Type/Filespec>>>>r   )r   r   r   r   efs         r   r   zgetDestStr.<locals>.<lambda>B  s=    *}A*}*}YPQSTVWyEYEY*}*}bc*}*}hi*}*}*} r   c                     d|  d| d| dS )Nz/A<</S/GoToR/Dz/F<</Fr   r   r   )r   r   r   s      r   r   zgetDestStr.<locals>.<lambda>C  s"    !V!!V!V1!V!V!V!V!V r   c                     d|  d| dS )Nz/A<</S/Launch/F<</Fr   r   r   )r   r   s     r   r   zgetDestStr.<locals>.<lambda>D  s    OAOO!OOO r   c                     d|  dS )Nz/A<</S/URI/URIz>>r   )r   s    r   r   zgetDestStr.<locals>.<lambda>E  s    .... r   r   r   r   r   r   r   r
   )r   rL   floatgetr   	LINK_NONEr   r   r   get_pdf_strr   r   r   r   )r   r   str_goto
str_gotor1
str_gotor2
str_launchstr_urir   d_kindd_zoomr   d_leftd_topfspecs                 r   
getDestStrr
  9  s;     rXXH}}JVVJOOJ..GE{{sEl""xa**YYvw011F"""rV})))61%%YYtW]1a0011xfeV44V}(((ww*5<88::V}+++#E&M22z%''V}***uV}q/@/@#E&M22z'-eDk::E5IIV}***uV}/A/A#E&M22z&M$KM$KM&M
 
 2r   lnkc           	         | j         }| }|d         }t          t          ||z                      }d}|d         t          j        k    r|d         dk    rt          j        d         }|d         }| j                            |          }	|                    dt          j	        dd                    }
| j        |         }|j         }| }|
|z  } ||	|j
        |j        |                    dd          |          }nt          j        d	         } |t          j        |d                   |          }n|d         t          j        k    r|d         dk    rt          j        d
         }|                    dt          j	        dd                    }
t          |
          t          j	        urt          j	        dd          }
 ||d         |
j
        |
j        |                    dd          |d         |d         |          }nt          j        d         } |t          j        |d                   |d         |          }n|d         t          j        k    r,t          j        d         } ||d         |d         |          }n|d         t          j        k    r%t          j        d         } ||d         |          }nS|d         t          j        k    r=t          j        d         }|                    d          }||d         } |||          }|s|S t%          d |                                 D                       }|                    dd          }|r!|d         |f|                                v r|}nFd}t          j                                        dz   }	 ||z  }||                                vrn|dz  }"|                    dd|z            }|S )Nr   rH   r   r
   r   goto1r   r   goto2gotor1r   gotor2launchr   r   name	nameddestc                 Z    g | ](}|d          t           j        k    |d         |d         f)S )r   r   r   )r   PDF_ANNOT_LINK)r>   r   s     r   r?   zgetLinkText.<locals>.<listcomp>  s5    TTT!QqTW=S5S5S!A$!5S5S5Sr   idr   z-L%iTr   z/Linkz/Link/NM(%s))transformation_matrixr   tupler   r   
annot_skelr   	page_xrefr   r   r   r   r   r   r   r   r   r   r   annot_xrefsitemsTOOLSset_annot_stemvaluesr   )r
   r  r   ictmrR   rg   annottxtpnor   r   	dest_pagedest_ctm	dest_ictmipntlname
link_namesold_namer  istems                        r   getLinkTextr-  s  s    
$C4DFAU1t8__%%DE
6{g'''v;!$W-Cf+C;((--D''$a 3 344CC(I 6H!	I?DCdfdfcggfa.@.@$GGEE$W-CC+CI66==EE	V*	*	*v;!$X.C''$a 3 344CCyy--mAq))CF""FF EE $X.CC+CI66FTJJEE	V+	+	+ *CKVd33	V(	(	( 'CJ%%	V*	*	* )=$EE4    TTt//11TTT J wwtR  H 	S[(+z/?/?/A/AAA}++--6	!8D:,,....FA		 MM'>D#899ELr   c                  <    d t          j                    D             S )zP
    Returns a list of upper-case colour names.
    :rtype: list of strings
    c                     g | ]	\  }}}}|
S r   r   )r>   r  rR   gr   s        r   r?   z getColorList.<locals>.<listcomp>  s     ???]T1aD???r   r   colors_wx_listr   r   r   getColorListr3    s"    
 @?g&<&>&>????r   c                  (    t          j                    S )z
    Returns list of (name, red, gree, blue) tuples, where:
        name: upper-case color name.
        read, green, blue: integers in range 0..255.
    :rtype: list of tuples
    r1  r   r   r   getColorInfoListr5    s     !###r   r  c                 t    t          j                                        |                                 d          S )zRetrieve RGB color in PDF format by name.

    Returns:
        a triple of floats in range 0 to 1. In case of name-not-found, "white" is returned.
    )r   r   r   )r   colors_pdf_dictr   r   )r  s    r   getColorr8    s+     "$$((yAAAr   c                    	 t                      t                                          |                                                    }n+# t          $ r t
          rt          j                     Y dS w xY w|d         dz  }|d         dz  }|d         dz  }t          |||          }t          |dz  d          }t          |||          }||z
  }|dk    rd}	n8||k    rd||z
  |z  d	z  z  }	n#||k    rd||z
  |z  dz   z  }	nd||z
  |z  d
z   z  }	t          t          |	                    }
|dk    rd}n||z  }t          t          |dz                      }|
||fS )zRetrieve the hue, saturation, value triple of a color name.

    Returns:
        a triple (degree, percent, percent). If not found (-1, -1, -1) is returned.
    )rW   rW   rW   r   g     o@r   r   d   r   g      N@   r,   )r5  r3  indexupperr   r   r   r   rK   rM   r\   rL   )r  r   rR   r0  r   cmaxVcmindeltahueHsatSs                r   getColorHSVrF    s   |~~33DJJLLAAB   <G$:$<$<$<|| 	
!uA	!uA	!uAq!Q<<DdSj!Aq!Q<<D4KEzz	A!+,	A!+,A!+,E#JJAqyydlE#)Aq!9s   AA $A0/A0docc                 ,   |                      |          \  }}}}d}d}|dk    r|||||fS |r	 t          j        |          }|j        }|j        }|j        }	||z
  dk     r|	j        |k     r|	j        }d|z
  }n-# t          $ r  t          j                     |dz  }|dz  }Y nw xY w|||||fS |dk    rQ	 t          j        |          }|j        }|j        }n7# t          $ r  t          j                     |dz  }|dz  }Y nw xY w|dz  }|dz  }|||||fS )Ng?gɿrH   )
fontbufferr   g333333?zn/a)	extract_fontr   Fontascender	descenderr   r2   r   r   )
rG  r   fontnameextstypebufferascdscfontr   s
             r   _get_font_propertiesrU    s   #&#3#3D#9#9 Hc5&
C
C
byyeS#-- .	<6222D-C.C9DSy1}}7S=='C#g 	 	 	"$$$3JC3JCCC	 eS#--
e||	<))D-C.CC 	 	 	"$$$3JC3JCCC	
 	s
s
S%c))s$   A
A: :'B$#B$5"C 'DDc                 h    d}d}| j         j        }	 |sn|dz  }||j        z  }|j        }d| d| S )Nr   r   z
num_spans=z num_chars=)
m_internalheadrO   next)r`   	num_spans	num_charsspans       r   _show_fz_textr]  ;  sa    
 II?D 	Q	TX	y :	99i999r   c                    | \  }}|dd                              d          dd         }|ddd}d}t          |          D ]\  }} |rd}
| d	k    r||dz            |d
<   d}!|                     d          r8| dd                             dd                              dd          }||d<   n|                     d          rt	          | dd                   }||d<   |S )a"  Make a Python dict from a PDF page label rule.

    Args:
        item -- a tuple (pno, rule) with the start page number and the rule
                string like <</S/D...>>.
    Returns:
        A dict like
        {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}.
    r   r   r   NrH   )	startpageprefixfirstpagenumFrE  styleTP()ra  Strb  )split	enumerate
startswithr   rL   )itemr#  ruler   skipr+  r   s          r   	rule_dictrn  ^  s    IC":C  $DR;;ADT?? " "4 	D3;;a!eAgJD??3 	QRR  b))11#r::AAhK??4   	"DHA !AnHr   c                      fd|D             d         }t          |          }|                    dd          }|                    dd          }|dv rdnd} |d         z
  |d	         z   |z   }t          |||          S )
zReturn the label for this page number.

    Args:
        pgNo: page number, 0-based.
        labels: result of doc._get_page_labels().
    Returns:
        The label (str) of the page number. Errors return an empty string.
    c                 ,    g | ]}|d          k    |S )r   r   )r>   r   pgNos     r   r?   z!get_label_pno.<locals>.<listcomp>  s"    ...!1Ar   rW   ra  rH   rc  )r   Ar   r`  rb  )rn  r   construct_label)rq  labelsrk  rl  ra  rc  rA  
pagenumbers   `       r   get_label_pnorv    s     /...v...r2DT??DXXh##FHHWb!!E:%%BB1E[))D,@@5HJ5&*555r   c                 ~   d}| dk    rt          |          }n| dk    r"t          |                                          }nw| dk    r"t          |                                          }nO| dk    r"t	          |                                          }n'| dk    r!t	          |                                          }||z   }|S )z9Construct a label based on style, prefix and page number.rH   DrR   Rr   rr  )strintegerToRomanr   r=  integerToLetter)rc  ra  r#  n_strresults        r   rs  rs    s     E||C	#s##))++	#s##))++	#$$**,,	#$$**,,e^FMr   c           
         ddl }|j        }d| }}t          d|          |k    r>|t          t	          j        d|                    z  }|dz  }t          d|          |k    >d}t          t          |                    D ]B}t          |t          t	          j        d|                              \  }}|||         z  }|}C|S )z-Returns letter sequence string for integer i.r   Nr      rH   )stringascii_uppercasepowrL   mathreversedrangedivmod)	r+  r  lsr   r   str_tjr   r0  s	            r   r|  r|    s     MMM		BaqA
b!**//	S"a!!!	Q b!**// EeAhh  aTXb!__--..1ALr   numc                 b    dfd}d                     d  ||           D                       S )z$Return roman numeral for an integer.))i  M)i  CM)i  rx  )i  CD)r:  C)Z   XC)2   L)(   XL)
   X)	   IX)rZ   r?  )r,   IV)r   Ic              3   r   K   D ]0\  }}t          | |          \  }}||z  V  | ||z  z  } | dk    r d S 1d S r.   )r  )r  rR   ltrr   rc   romans        r   	roman_numz!integerToRoman.<locals>.roman_num  sd       	 	FAs#q>>DAq'MMM1q5LCaxx 		 	r   rH   c                     g | ]}|S r   r   )r>   r   s     r   r?   z"integerToRoman.<locals>.<listcomp>  s    ...!A...r   )join)r  r  r  s     @r   r{  r{    sN    E      77..yy~~...///r   line_dirr\  r   c                    | |d         } | \  }}t          j        |          }t           j                                        rd}n|d         |d         z
  }||d         z  }||z  }||z  }|dk    r7|dk    r1|j        d|fz
  }	|j        |dfz   }
|j        |dfz
  }|j        d|fz   }n|dk    r7|dk    r1|j        |dfz   }	|j        d|fz
  }
|j        d|fz   }|j        |dfz
  }nm|dk    r7|dk    r1|j        d|fz
  }	|j        |dfz   }
|j        |dfz
  }|j        d|fz   }n0|j        |dfz   }	|j        d|fz
  }
|j        d|fz   }|j        |dfz
  }t          j        |	|
||          S )a  Compute the quad located inside the bbox.

    The bbox may be any of the resp. tuples occurring inside the given span.

    Args:
        line_dir: (tuple) 'line["dir"]' of the owning line or None.
        span: (dict) the span. May be from get_texttrace() method.
        bbox: (tuple) the bbox of the span or any of its characters.
    Returns:
        The quad which is wrapped by the bbox.
    Ndirr   rL  rM  sizer   )	r   r0   r  set_small_glyph_heightsbltrbrtlQuad)r  r\  r   cossinr   r]   hshculurlllrs                r   recover_bbox_quadr    s    ;HC<D},,.. 1tK00fF 
#B	#B	Qww277W2wWAwWAwW2w	qR1WWWAwW2wW2wWAw	qR1WWW2wWAwWAwW2wWAwW2wW2wWAw<BB'''r   c                     t          |           t          ust          |           dk    rt          d          t          |          t          urt          d          t          | ||d                   S )zRecover the quadrilateral of a text span.

    Args:
        line_dir: (tuple) 'line["dir"]' of the owning line.
        span: the span.
    Returns:
        The quadrilateral enveloping the span's text.
    r   bad line dir argumentbad span argumentr   )r   r  rO   r"   r   r  )r  r\  s     r   recover_quadr    sj     H~~U""c(mmq&8&80111Dzz,---XtT&\:::r   r8   spansc                    || d         }t          |          dk    rt          d          | d         }|\  }}t          ||d                   }t          |          dk    rt          ||d                   }n|}|j        }|j        }t          j        ||          }	||	z  }
t
          j                                        t          fd|D                       }t          j
        d| |
j        d          }|j        }||	 z  }|S )	a  Calculate the line quad for 'dict' / 'rawdict' text extractions.

    The lower quad points are those of the first, resp. last span quad.
    The upper points are determined by the maximum span quad height.
    From this, compute a rect with bottom-left in (0, 0), convert this to a
    quad and rotate and shift back to cover the text of the spans.

    Args:
        spans: (list, optional) sub-list of spans to consider.
    Returns:
        pymupdf.Quad covering selected spans.
    Nr  r   zbad span listr  r   rW   c                 L    g | ] }|d          rdn|d         |d         z
  z  !S )r  r   rL  rM  r   )r>   ssmalls     r   r?   z%recover_line_quad.<locals>.<listcomp>B  s8    WWWA65Faaq}q~'E	GWWWr   )rO   r"   r  r  r  r   planish_liner  r  rK   r0   r   quad)r8   r  r  r  r  q0q1line_llline_lrmat0x_lrh	line_rect	line_quadr  s                 @r   recover_line_quadr    s    }W
5zzQ)))E{HHC	ha	)	)B
5zzA~~(E"I..eGeG11D T>DM1133EWWWWQVWWW	 	A QDFA..II$Ir   charsc                 ,   | |d         } |t          | |          S d|                                vrt          d          t          | ||d                   }t	          |          dk    rt          | ||d                   }n|}|j        }|j        }t          j        ||          }||z  }t          j	        
                                }	|d         |	rdn|d	         |d
         z
  z  }
t          j        d|
 |j        d          }|j        }|| z  }|S )a^  Calculate the span quad for 'dict' / 'rawdict' text extractions.

    Notes:
        There are two execution paths:
        1. For the full span quad, the result of 'recover_quad' is returned.
        2. For the quad of a sub-list of characters, the char quads are
           computed and joined. This is only supported for the "rawdict"
           extraction option.

    Args:
        line_dir: (tuple) 'line["dir"]' of the owning line.
        span: (dict) the span.
        chars: (list, optional) sub-list of characters to consider.
    Returns:
        pymupdf.Quad covering selected characters.
    Nr  r  z)need 'rawdict' option to sub-select charsr   r   rW   r  rL  rM  )r  r   r"   recover_char_quadrO   r  r  r   r  r  r  r0   r   r  )r  r\  r  r  r  span_llspan_lrr  r  r  r  	span_rect	span_quads                r   recover_span_quadr  K  s   " ;}Hd+++diikk!!DEEE	8T58	4	4B
5zzA~~xuRy99eGeG11DT>DM1133EVUNj)9D<M)MOAQDFA..II$Ir   charc                    | |d         } t          |           t          ust          |           dk    rt          d          t          |          t          urt          d          t          |          t          u rt          j        |d                   }n@t          |          t          u rt          j        |d                   }nt          d          t          | ||          S )aD  Recover the quadrilateral of a text character.

    This requires the "rawdict" option of text extraction.

    Args:
        line_dir: (tuple) 'line["dir"]' of the span's line.
        span: (dict) the span dict.
        char: (dict) the character dict.
    Returns:
        The quadrilateral enveloping the character.
    Nr  r   r  r  r   r   )r   r  rO   r"   r   r   r0   r  )r  r\  r  r   s       r   r  r  x  s     ;H~~U""c(mmq&8&80111Dzz,---DzzT|DL))	du		|DG$$,---XtT222r   )NNNF)NNNFNr   )NNNr   )N)NN)r   rr   rs   FN)r`   )Cr  typingr   rH   r   r   r   format_gr   r   r   r   r   r	   
ByteStringAttributeErrorbytes	bytearray
memoryviewAnyAnyTypeUnionrL   OptIntOptionalr   OptFloatrz  OptStrr   OptDictOptBytesSequenceOptSeqr   TextPageboollistr&   rC   rf   rk   rq   r   r   r   r
  r-  r3  r5  r  r8  rF  r   rU  r]  rn  rv  rs  r|  r{  r  r  r  r  r  r  r   r   r   <module>r     s        NNNNN   LLLLL 	3 
		0"JJ 0 0 0"Z/JJJ0 *	c4i	 ?5!		
/$
?:&		) !% 
,
  	
  
   H !%L L
,L
L L 	L
 L 
L L L Lb !%r r
,r
r r 	r 	r r r rp "& 
,
  		   ( !% 
, 	 	
    * Q Q
,QQ Q 
	Q
 Q Q Q Q Q Ql j !%j j j
,jj 	j
 j j j j j jZ8 8d 8 8 8 8v7S 7 7# 7 7 7 7tPgl P P# P P P PB@d @ @ @ @$$ $ $ $ $B3 B5 B B B B$c $e $ $ $ $N"*g. "*c "*e "* "* "* "*J: : :"$  D6 6 6*3    &#    $0 0 0 0 0 0D/( /(T /( /(7< /( /( /( /(d;5 ; ; ; ; ; ; * *D * * * * * *Z* * *T *$ *', * * * *Z3 3T 3 3', 3 3 3 3 3 3s*    	!!, 	88A A*)A*