from __future__ import annotations

import re
import inspect

from bisect import bisect_left
from typing import Callable, Sequence, TYPE_CHECKING
from functools import cache
from itertools import accumulate
from contextlib import suppress

import mpire

from tqdm import tqdm

if TYPE_CHECKING:
    import tiktoken
    import tokenizers
    import transformers

_memoized_token_counters = {}
"""A map of token counters to their memoized versions."""

_NON_WHITESPACE_SEMANTIC_SPLITTERS = (
    '.', '?', '!', '*', # Sentence terminators.
    ';', ',', '(', ')', '[', ']', '“', '”', '‘', '’', "'", '"', '`', # Clause separators.
    ':', '—', '…', # Sentence interrupters.
    '/', '\\', '–', '&', '-', # Word joiners.
)
"""A tuple of semantically meaningful non-whitespace splitters that may be used to chunk texts, ordered from most desirable to least desirable."""

def _split_text(text: str) -> tuple[str, bool, list[str]]:
    """Split text using the most semantically meaningful splitter possible."""
    
    splitter_is_whitespace = True
    
    # Try splitting at, in order of most desirable to least desirable:
    # - The largest sequence of newlines and/or carriage returns;
    # - The largest sequence of tabs;
    # - The largest sequence of whitespace characters; and
    # - A semantically meaningful non-whitespace splitter.
    if '\n' in text or '\r' in text:
        splitter = max(re.findall(r'[\r\n]+', text))
    
    elif '\t' in text:
        splitter = max(re.findall(r'\t+', text))
    
    elif re.search(r'\s', text):
        splitter = max(re.findall(r'\s+', text))
    
    else:
        # Identify the most desirable semantically meaningful non-whitespace splitter present in the text.
        for splitter in _NON_WHITESPACE_SEMANTIC_SPLITTERS:
            if splitter in text:
                splitter_is_whitespace = False
                break
        
        # If no semantically meaningful splitter is present in the text, return an empty string as the splitter and the text as a list of characters.
        else:
            return '', splitter_is_whitespace, list(text)
    
    # Return the splitter and the split text.
    return splitter, splitter_is_whitespace, text.split(splitter)
def merge_splits(splits: list[str], chunk_size: int, splitter: str, token_counter: Callable) -> tuple[int, str]:
    """Merge splits until a chunk size is reached, returning the index of the last split included in the merged chunk along with the merged chunk itself."""
    
    average = 0.2
    low = 0
    high = len(splits) + 1
    cumulative_lengths = list(accumulate([len(split) for split in splits], initial = 0))
    cumulative_lengths.append(cumulative_lengths[-1])
    
    while low < high:
        i = bisect_left(cumulative_lengths[low : high + 1], chunk_size * average)
        midpoint = min(i + low, high - 1)
        
        tokens = token_counter(splitter.join(splits[:midpoint]))
        
        average = cumulative_lengths[midpoint] / tokens if cumulative_lengths[midpoint] and tokens > 0 else average
        
        if tokens > chunk_size:
            high = midpoint
        
        else:
            low = midpoint + 1
    
    return low - 1, splitter.join(splits[:low - 1])
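# Note on the search above: rather than counting tokens once per candidate prefix,
# `merge_splits` binary-searches for the longest prefix of `splits` that still fits in
# `chunk_size` tokens. `average` tracks the observed characters-per-token ratio of the
# text probed so far (seeded at 0.2), which lets `bisect_left` jump through the
# cumulative character-length table to a likely midpoint, keeping the number of calls
# to the (potentially expensive) token counter low.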
def chunk(
    text: str,
    chunk_size: int,
    token_counter: Callable[[str], int],
    memoize: bool = True,
    _recursion_depth: int = 0,
    _reattach_whitespace_splitters: bool = False,
) -> list[str]:
    """Split a text into semantically meaningful chunks of a specified size as determined by the provided token counter.
    
    Args:
        text (str): The text to be chunked.
        chunk_size (int): The maximum number of tokens a chunk may contain.
        token_counter (Callable[[str], int]): A callable that takes a string and returns the number of tokens in it.
        memoize (bool, optional): Whether to memoize the token counter. Defaults to `True`.
    
    Returns:
        list[str]: A list of chunks up to `chunk_size`-tokens-long, with any whitespace used to split the text removed."""
    
    # If this is not a recursive call and memoization is enabled, overwrite the token counter with a memoized version of itself.
    if not _recursion_depth and memoize:
        token_counter = _memoized_token_counters.setdefault(token_counter, cache(token_counter))
    
    # Split the text using the most semantically meaningful splitter possible.
    splitter, splitter_is_whitespace, splits = _split_text(text)
    
    if _reattach_whitespace_splitters:
        splitter_is_whitespace = False
    
    chunks = []
    skips = set()
    """A set of the indices of splits to skip because they have already been added to a chunk."""
    
    # Iterate through the splits.
    for i, split in enumerate(splits):
        # Skip the split if it has already been added to a chunk.
        if i in skips:
            continue
        
        # If the split is over the chunk size, recursively chunk it.
        if token_counter(split) > chunk_size:
            chunks.extend(chunk(split, chunk_size, token_counter = token_counter, memoize = memoize, _recursion_depth = _recursion_depth + 1, _reattach_whitespace_splitters = _reattach_whitespace_splitters))
        
        # If the split is equal to or under the chunk size, merge it with subsequent splits until the chunk size is reached.
        else:
            final_split_in_chunk_i, new_chunk = merge_splits(splits[i:], chunk_size, splitter, token_counter)
            
            # Mark any splits included in the new chunk for exclusion from future chunks.
            skips.update(range(i + 1, i + final_split_in_chunk_i))
            
            # Add the chunk.
            chunks.append(new_chunk)
        
        # If the splitter is not whitespace and this is not the last split nor are all subsequent splits already skipped, reattach the splitter: append it to the latest chunk if doing so would not exceed the chunk size, otherwise add it as its own chunk.
        if not splitter_is_whitespace and not (i == len(splits) - 1 or all(j in skips for j in range(i + 1, len(splits)))):
            if token_counter(last_chunk_with_splitter := chunks[-1] + splitter) <= chunk_size:
                chunks[-1] = last_chunk_with_splitter
            
            else:
                chunks.append(splitter)
    
    # If this is not a recursive call, remove any empty chunks.
    if not _recursion_depth:
        chunks = list(filter(None, chunks))
    
    return chunks
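# Example usage (illustrative only; the whitespace word counter below is a stand-in for
# a real tokenizer-based token counter):
#
#     >>> chunk('The quick brown fox jumps over the lazy dog.', chunk_size = 4, token_counter = lambda text: len(text.split()))
#     ['The quick brown fox', 'jumps over the lazy', 'dog.']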
h'''  ,fT6**++Mr2   c                  ,    e Zd ZddZddZ	 	 dddZdS )Chunkerr5   r6   r7   rN   r    Nonec                "    || _         || _        d S rU   )r5   r7   )selfr5   r7   s      r0   __init__zChunker.__init__   s    $*r2   r   r   r4   c                <    t          || j        | j        d          S )zChunk a text.F)rO   )r^   r5   r7   )rk   r   s     r0   r^   zChunker.chunk   s      T4?D,>%PPPPr2   r:   Ftext_or_textsstr | Sequence[str]	processesprogressrP   list[str] | list[list[str]]c                H    t          |t                    r                     |          S |r|dk    rt          |          }|dk    r fd|D             S t	          j        |d          5 }|                     j        ||          cddd           S # 1 swxY w Y   dS )a{  Split text or texts into semantically meaningful chunks of a specified size as determined by the provided tokenizer or token counter.
        
        Args:
            text_or_texts (str | Sequence[str]): The text or texts to be chunked.
        
        Returns:
            list[str] | list[list[str]]: If a single text has been provided, a list of chunks up to `chunk_size`-tokens-long, with any whitespace used to split the text removed, or, if multiple texts have been provided, a list of lists of chunks, with each inner list corresponding to the chunks of one of the provided input texts.
            processes (int, optional): The number of processes to use when chunking multiple texts. Defaults to `1` in which case chunking will occur in the main process.
            progress (bool, optional): Whether to display a progress bar when chunking multiple texts. Defaults to `False`.r:   c                :    g | ]}                     |          S r<   )r^   )r?   r   rk   s     r0   r@   z$Chunker.__call__.<locals>.<listcomp>   s%    ???DJJt$$???r2   T)use_dill)progress_barN)
isinstancer   r^   r   mpire
WorkerPoolmap)rk   rn   rp   rq   pools   `    r0   __call__zChunker.__call__   s
    mS)) 	-::m,,, 	0	Q //M>>????????iD999 	PT88DJh8OO	P 	P 	P 	P 	P 	P 	P 	P 	P 	P 	P 	P 	P 	P 	P 	P 	P 	Ps   -BBBN)r5   r6   r7   rN   r    ri   )r   r   r    r4   )r:   F)rn   ro   rp   r6   rq   rP   r    rr   )__name__
__module____qualname__rl   r^   r|   r<   r2   r0   rh   rh      sh        + + + +Q Q Q Q 	P P P P P P Pr2   rh   tokenizer_or_token_counterhstr | tiktoken.Encoding | transformers.PreTrainedTokenizer | tokenizers.Tokenizer | Callable[[str], int]
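# Usage sketch (illustrative; a `Chunker` is normally constructed via `chunkerify`
# below rather than instantiated directly):
#
#     >>> chunker = Chunker(chunk_size = 4, token_counter = lambda text: len(text.split()))
#     >>> chunker('The quick brown fox jumps over the lazy dog.')
#     ['The quick brown fox', 'jumps over the lazy', 'dog.']
#     >>> chunker(['First text.', 'Second text.'], processes = 2, progress = True)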
def chunkerify(
    tokenizer_or_token_counter: str | tiktoken.Encoding | transformers.PreTrainedTokenizer | tokenizers.Tokenizer | Callable[[str], int],
    chunk_size: int | None = None,
    max_token_chars: int | None = None,
    memoize: bool = True,
) -> Chunker:
    """Construct a chunker that splits one or more texts into semantically meaningful chunks of a specified size as determined by the provided tokenizer or token counter.
    
    Args:
        tokenizer_or_token_counter (str | tiktoken.Encoding | transformers.PreTrainedTokenizer | tokenizers.Tokenizer | Callable[[str], int]): Either: the name of a `tiktoken` or `transformers` tokenizer (with priority given to the former); a tokenizer that possesses an `encode` attribute (eg, a `tiktoken`, `transformers` or `tokenizers` tokenizer); or a token counter that returns the number of tokens in an input.
        chunk_size (int, optional): The maximum number of tokens a chunk may contain. Defaults to `None` in which case it will be set to the same value as the tokenizer's `model_max_length` attribute (reduced by the number of tokens returned by attempting to tokenize an empty string) if possible otherwise a `ValueError` will be raised.
        max_token_chars (int, optional): The maximum number of characters a token may contain. Used to significantly speed up the token counting of long inputs. Defaults to `None` in which case it will either not be used or will, if possible, be set to the number of characters in the longest token in the tokenizer's vocabulary as determined by the `token_byte_values` or `get_vocab` methods.
        memoize (bool, optional): Whether to memoize the token counter. Defaults to `True`.
    
    Returns:
        Callable[[str | Sequence[str], bool, bool], list[str] | list[list[str]]]: A chunker that takes either a single text or a sequence of texts and returns, if a single text has been provided, a list of chunks up to `chunk_size`-tokens-long with any whitespace used to split the text removed, or, if multiple texts have been provided, a list of lists of chunks, with each inner list corresponding to the chunks of one of the provided input texts.
        
        The resulting chunker can be passed a `processes` argument that specifies the number of processes to be used when chunking multiple texts.
        
        It is also possible to pass a `progress` argument which, if set to `True` and multiple texts are passed, will display a progress bar.
        
        Technically, the chunker will be an instance of the `semchunk.Chunker` class to assist with type hinting, though this should have no impact on how it can be used."""
    
    # If the provided tokenizer is a string, try to load it with either `tiktoken` or `transformers`, or raise an error if neither succeeds.
    if isinstance(tokenizer_or_token_counter, str):
        try:
            import tiktoken
            
            try:
                tokenizer = tiktoken.encoding_for_model(tokenizer_or_token_counter)
            
            except Exception:
                tokenizer = tiktoken.get_encoding(tokenizer_or_token_counter)
        
        except Exception:
            try:
                import transformers
                
                tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_or_token_counter)
            
            except Exception:
                raise ValueError(f'"{tokenizer_or_token_counter}" was provided to `semchunk.chunkerify` as the name of a tokenizer but neither `tiktoken` nor `transformers` have a tokenizer by that name. Perhaps they are not installed or maybe there is a typo in that name?')
        
        tokenizer_or_token_counter = tokenizer
    
    # If the number of characters in the longest token in the tokenizer's vocabulary has not been provided, determine it if possible.
    if max_token_chars is None:
        for potential_vocabulary_getter_function in ('token_byte_values', 'get_vocab'):
            if hasattr(tokenizer_or_token_counter, potential_vocabulary_getter_function) and callable(getattr(tokenizer_or_token_counter, potential_vocabulary_getter_function)):
                vocab = getattr(tokenizer_or_token_counter, potential_vocabulary_getter_function)()
                
                if hasattr(vocab, '__iter__') and vocab and all(hasattr(token, '__len__') for token in vocab):
                    max_token_chars = max(len(token) for token in vocab)
                    break
    
    # If a chunk size has not been specified, set it to the maximum number of tokens the tokenizer supports if possible, otherwise raise an error.
    if chunk_size is None:
        if hasattr(tokenizer_or_token_counter, 'model_max_length') and isinstance(tokenizer_or_token_counter.model_max_length, int):
            chunk_size = tokenizer_or_token_counter.model_max_length
            
            # Attempt to reduce the chunk size by the number of special tokens the tokenizer adds to an empty input.
            with suppress(Exception):
                chunk_size -= len(tokenizer_or_token_counter.encode(''))
        
        else:
            raise ValueError("Your desired chunk size was not passed to `semchunk.chunkerify` and the provided tokenizer either lacks an attribute named 'model_max_length' or that attribute is not an integer. Either specify a chunk size or provide a tokenizer that has a 'model_max_length' attribute that is an integer.")
    
    # If the tokenizer has an `encode` attribute, construct a token counter from it.
    if hasattr(tokenizer_or_token_counter, 'encode'):
        # Determine whether the tokenizer accepts an `add_special_tokens` argument and, if it does, ensure that it is always disabled.
        if 'add_special_tokens' in inspect.signature(tokenizer_or_token_counter.encode).parameters:
            def token_counter(text: str) -> int:
                return len(tokenizer_or_token_counter.encode(text, add_special_tokens = False))
        
        else:
            def token_counter(text: str) -> int:
                return len(tokenizer_or_token_counter.encode(text))
    
    else:
        token_counter = tokenizer_or_token_counter
    
    # If the number of characters in the longest token in the tokenizer's vocabulary was provided or determined, construct a faster token counter that short-circuits on very long inputs.
    if max_token_chars is not None:
        max_token_chars = max_token_chars - 1
        original_token_counter = token_counter
        
        def faster_token_counter(text: str) -> int:
            heuristic = chunk_size * 6
            
            # If the input is much longer than the chunk size and even a truncated prefix of it exceeds the chunk size, we already know the input is over the chunk size and can skip counting the rest.
            if len(text) > heuristic and original_token_counter(text[:heuristic + max_token_chars]) > chunk_size:
                return chunk_size + 1
            
            return original_token_counter(text)
        
        token_counter = faster_token_counter
    
    # Memoize the token counter if necessary.
    if memoize:
        token_counter = _memoized_token_counters.setdefault(token_counter, cache(token_counter))
    
    # Construct and return the chunker.
    return Chunker(chunk_size = chunk_size, token_counter = token_counter)