
    h1                    J   d dl mZ d dlZd dlZd dlmZmZ d dlmZm	Z	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZmZmZmZmZmZ d d	lmZmZ  ej6                  e      Z ed
d      Z G d dee      Z G d de      Z  G d de!e      Z" ed       G d d             Z#ddZ$y)    )annotationsN)ABCabstractmethod)
CollectionIterableSequence)Set)	dataclass)Enum)AnyCallableLiteralOptionalTypeVarUnion)BaseDocumentTransformerDocumentTSTextSplitter)boundc                      e Zd ZdZddedddf	 	 	 	 	 	 	 	 	 	 	 	 	 ddZedd       Z	 d	 	 	 	 	 dd	Zdd
Z	ddZ
ddZedd       Zedd e       df	 	 	 	 	 	 	 	 	 	 	 	 	 dd       Z	 	 	 	 	 	 ddZy)r   z)Interface for splitting text into chunks.i     FTc                    |dk  rd| }t        |      |dk  rd| }t        |      ||kD  rd| d| d}t        |      || _        || _        || _        || _        || _        || _        y)ad  Create a new TextSplitter.

        Args:
            chunk_size: Maximum size of chunks to return
            chunk_overlap: Overlap in characters between chunks
            length_function: Function that measures the length of given chunks
            keep_separator: Whether to keep the separator and where to place it
                            in each corresponding chunk (True='start')
            add_start_index: If `True`, includes chunk's start index in metadata
            strip_whitespace: If `True`, strips whitespace from the start and end of
                              every document
        r   zchunk_size must be > 0, got z chunk_overlap must be >= 0, got zGot a larger chunk overlap (z) than chunk size (z), should be smaller.N)
ValueError_chunk_size_chunk_overlap_length_function_keep_separator_add_start_index_strip_whitespace)self
chunk_sizechunk_overlaplength_functionkeep_separatoradd_start_indexstrip_whitespacemsgs           \/opt/lhia/marcimex/python/venv/lib/python3.12/site-packages/langchain_text_splitters/base.py__init__zTextSplitter.__init__   s    * ?0=CS/!14]ODCS/!:%.}o ><46  S/!%+ /- /!1    c                     y)z$Split text into multiple components.N )r!   texts     r)   
split_textzTextSplitter.split_textE   s    r+   Nc           	        |xs i gt        |      z  }g }t        |      D ]  \  }}d}d}| j                  |      D ]  }	t        j                  ||         }
| j
                  r>||z   | j                  z
  }|j                  |	t        d|            }||
d<   t        |	      }t        |	|
      }|j                  |         |S )z&Create documents from a list of texts.r   start_index)page_contentmetadata)len	enumerater/   copydeepcopyr   r   findmaxr   append)r!   texts	metadatas
_metadatas	documentsir.   indexprevious_chunk_lenchunkr3   offsetnew_docs                r)   create_documentszTextSplitter.create_documentsI   s     32$U"3
	 ' 	*GAtE!". *==A7(("%77$:M:MMF IIeSF^<E.3H]+),U&"I  )*	* r+   c                    g g }}|D ]8  }|j                  |j                         |j                  |j                         : | j                  ||      S )zSplit documents.)r<   )r:   r2   r3   rE   )r!   r>   r;   r<   docs        r)   split_documentszTextSplitter.split_documents]   sV    ry 	+CLL))*S\\*	+ $$Ui$@@r+   c                l    |j                  |      }| j                  r|j                         }|dk(  ry |S )N )joinr    strip)r!   docs	separatorr.   s       r)   
_join_docszTextSplitter._join_docse   s3    ~~d#!!::<D2:r+   c                d   | j                  |      }g }g }d}|D ]m  }| j                  |      }||z   t        |      dkD  r|ndz   | j                  kD  r
|| j                  kD  r%t        j	                  d| d| j                          t        |      dkD  r| j                  ||      }	|	|j                  |	       || j                  kD  s*||z   t        |      dkD  r|ndz   | j                  kD  ro|dkD  rj|| j                  |d         t        |      dkD  r|ndz   z  }|dd  }|| j                  kD  r?||z   t        |      dkD  r|ndz   | j                  kD  r|dkD  rj|j                  |       ||t        |      dkD  r|ndz   z  }p | j                  ||      }	|	|j                  |	       |S )Nr   zCreated a chunk of size z%, which is longer than the specified    )r   r4   r   loggerwarningrO   r:   r   )
r!   splitsrN   separator_lenrM   current_doctotald_lenrG   s
             r)   _merge_splitszTextSplitter._merge_splitsm   s    --i8!# 	KA((+D[1AA1E1M""# 4+++NN25' :>>B>N>N=OQ {#a'//+yACC(  $"5"55[9IA9MSTU**+!AI!6!6{1~!F-0-=-AMq"  '2!"o  $"5"55[9IA9MSTU**+!AI q!Tc+.>.B]JJE9	K: ook95?KKr+   c                    	 ddl m} t        |      sd}t        |      dfd} | dd|i|S # t        $ r d}t        |      w xY w)	z>Text splitter that uses HuggingFace tokenizer to count length.r   )PreTrainedTokenizerBasezATokenizer received was not an instance of PreTrainedTokenizerBasec                8    t        j                  |             S N)r4   tokenizer.   	tokenizers    r)   _huggingface_tokenizer_lengthzNTextSplitter.from_huggingface_tokenizer.<locals>._huggingface_tokenizer_length   s    9--d344r+   z`Could not import transformers python package. Please install it with `pip install transformers`.r$   r.   strreturnintr-   )$transformers.tokenization_utils_baser\   
isinstancer   ImportError)clsra   kwargsr\   r(   rb   s    `    r)   from_huggingface_tokenizerz'TextSplitter.from_huggingface_tokenizer   sm    	"Ti)@AW  !o%5 K#@KFKK  	"E  S/!	"s	   %2 A	gpt2allc                   
 	 ddl }||j                  |      
n|j                  |      
d
fd}t	        | t
              r||d}	i ||	} | dd|i|S # t        $ r d}t        |      w xY w)	z9Text splitter that uses tiktoken encoder to count length.r   NzCould not import tiktoken python package. This is needed in order to calculate max_tokens_for_prompt. Please install it with `pip install tiktoken`.c                >    t        j                  |             S N)allowed_specialdisallowed_special)r4   encode)r.   rr   rs   encs    r)   _tiktoken_encoderz=TextSplitter.from_tiktoken_encoder.<locals>._tiktoken_encoder   s*    

$3'9   r+   )encoding_name
model_namerr   rs   r$   rc   r-   )tiktokenri   encoding_for_modelget_encoding
issubclassTokenTextSplitter)rj   rw   rx   rr   rs   rk   ry   r(   rv   extra_kwargsru   s      ``     @r)   from_tiktoken_encoderz"TextSplitter.from_tiktoken_encoder   s    	# !--j9C''6C	 c,-!.(#2&8	L 0/,/F?#4????  	#A 
 c""	#s   A A4c                6    | j                  t        |            S )z2Transform sequence of documents by splitting them.)rH   list)r!   r>   rk   s      r)   transform_documentsz TextSplitter.transform_documents   s     ##DO44r+   )r"   rf   r#   rf   r$   zCallable[[str], int]r%   z$Union[bool, Literal['start', 'end']]r&   boolr'   r   re   Noner.   rd   re   	list[str]r^   )r;   r   r<   zOptional[list[dict[Any, Any]]]re   list[Document])r>   zIterable[Document]re   r   )rM   r   rN   rd   re   Optional[str])rT   zIterable[str]rN   rd   re   r   )ra   r   rk   r   re   r   )rj   ztype[TS]rw   rd   rx   r   rr   'Union[Literal['all'], AbstractSet[str]]rs   &Union[Literal['all'], Collection[str]]rk   r   re   r   )r>   Sequence[Document]rk   r   re   r   )__name__
__module____qualname____doc__r4   r*   r   r/   rE   rH   rO   rZ   classmethodrl   setr   r   r-   r+   r)   r   r      sQ   3  03?D %!%&2&2 &2 .	&2
 =&2 &2 &2 
&2P 3 3 MQ+I	(A(T L L,  $$(CF5EJ*@*@*@ "*@ A	*@
 C*@ *@ 
*@ *@X5+57:5	5r+   c                  V     e Zd ZdZdd e       df	 	 	 	 	 	 	 	 	 	 	 d fdZddZ xZS )	r}   z/Splitting text to tokens using model tokenizer.rm   Nrn   c                    t        	|   di | 	 ddl}||j	                  |      }n|j                  |      }|| _        || _        || _        y# t        $ r d}t        |      w xY w)zCreate a new TextSplitter.r   NzCould not import tiktoken python package. This is needed in order to for TokenTextSplitter. Please install it with `pip install tiktoken`.r-   )	superr*   ry   ri   rz   r{   
_tokenizer_allowed_special_disallowed_special)
r!   rw   rx   rr   rs   rk   ry   r(   ru   	__class__s
            r)   r*   zTokenTextSplitter.__init__   s     	"6"	# !--j9C''6C /#5   	#A 
 c""	#s   A A(c                     d fd}t         j                   j                   j                  j                  |      }t        ||      S )a  Splits the input text into smaller chunks based on tokenization.

        This method uses a custom tokenizer configuration to encode the input text
        into tokens, processes the tokens in chunks of a specified size with overlap,
        and decodes them back into text chunks. The splitting is performed using the
        `split_text_on_tokens` function.

        Args:
            text (str): The input text to be split into smaller chunks.

        Returns:
            List[str]: A list of text chunks, where each chunk is derived from a portion
            of the input text based on the tokenization and chunking rules.
        c                h    j                   j                  | j                  j                        S rq   )r   rt   r   r   )_textr!   s    r)   _encodez-TokenTextSplitter.split_text.<locals>._encode  s4    ??)) $ 5 5#'#;#; *  r+   )r#   tokens_per_chunkdecodert   r`   )r   rd   re   z	list[int])	Tokenizerr   r   r   r   split_text_on_tokens)r!   r.   r   ra   s   `   r)   r/   zTokenTextSplitter.split_text  sE     	 --!--??))	
	 $CCr+   )rw   rd   rx   r   rr   r   rs   r   rk   r   re   r   r   )r   r   r   r   r   r*   r/   __classcell__)r   s   @r)   r}   r}      s]    9 $$(CF5EJ66 "6 A	6
 C6 6 
68Dr+   r}   c                  |    e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZy)Languagez"Enum of the programming languages.cppgojavakotlinjstsphpprotopythonrstrubyrustscalaswiftmarkdownlatexhtmlsolcsharpcobolcluaperlhaskellelixir
powershellvisualbasic6N)r   r   r   r   CPPGOJAVAKOTLINJSr   PHPPROTOPYTHONRSTRUBYRUSTSCALASWIFTMARKDOWNLATEXHTMLSOLCSHARPCOBOLCLUAPERLHASKELLELIXIR
POWERSHELLVISUALBASIC6r-   r+   r)   r   r   "  s    ,
C	BDF	B	B
CEF
CDDEEHED
CFEA
CDGFJ!Lr+   r   T)frozenc                  @    e Zd ZU dZded<   	 ded<   	 ded<   	 ded<   y	)
r   zTokenizer data class.rf   r#   r   zCallable[[list[int]], str]r   zCallable[[str], list[int]]rt   N)r   r   r   r   __annotations__r-   r+   r)   r   r   B  s)    *,&&=&&=r+   r   c                   g }|j                  |       }d}t        ||j                  z   t        |            }||| }|t        |      k  r|j	                  |j                  |             |t        |      k(  r	 |S ||j                  |j                  z
  z  }t        ||j                  z   t        |            }||| }|t        |      k  r|S )z6Split incoming text and return chunks using tokenizer.r   )rt   minr   r4   r:   r   r#   )r.   ra   rT   	input_ids	start_idxcur_idx	chunk_idss          r)   r   r   P  s    F  &II)i888#i.IG)G,I
c)n
$i&&y12c)n$ M 	Y//)2I2III	i)"<"<<c)nMi0	 c)n
$ Mr+   )r.   rd   ra   r   re   r   )%
__future__r   r6   loggingabcr   r   collections.abcr   r   r   r	   AbstractSetdataclassesr
   enumr   typingr   r   r   r   r   r   langchain_core.documentsr   r   	getLoggerr   rR   r   r   r}   rd   r   r   r   r-   r+   r)   <module>r      s    "   # : : . !   G			8	$T(E5*C E5P=D =D@"sD "@ $
> 
> 
>r+   