
    j
i;                       d Z ddlmZ ddlZddlZddlmZmZ ddlm	Z	 ddl
mZ ddlmZmZmZmZ ddlmZmZ dd	lmZmZ erdd
lmZmZmZmZ ddlmZ 	 ddlZdZn# e $ r dZY nw xY w	 ddl!m"Z" dZ#n# e $ r dZ#Y nw xY w ej$        e%          Z& edd          Z' G d dee          Z( G d de(          Z) G d de*e          Z+ e	d           G d d                      Z,d dZ-dS )!zText splitter base interface.    )annotationsN)ABCabstractmethod)	dataclass)Enum)TYPE_CHECKINGAnyLiteralTypeVar)BaseDocumentTransformerDocument)Selfoverride)Callable
CollectionIterableSequence)SetTF)PreTrainedTokenizerBaseTSTextSplitter)boundc                      e Zd ZdZddedddfd8dZed9d            Z	 d:d;dZd<dZ	d=d#Z
d>d&Zed?d+            Zed,d e            d-fd@d5            ZedAd7            ZdS )Br   z)Interface for splitting text into chunks.i     FT
chunk_sizeintchunk_overlaplength_functionCallable[[str], int]keep_separatorbool | Literal['start', 'end']add_start_indexboolstrip_whitespacereturnNonec                    |dk    rd| }t          |          |dk     rd| }t          |          ||k    rd| d| d}t          |          || _        || _        || _        || _        || _        || _        dS )a$  Create a new `TextSplitter`.

        Args:
            chunk_size: Maximum size of chunks to return
            chunk_overlap: Overlap in characters between chunks
            length_function: Function that measures the length of given chunks
            keep_separator: Whether to keep the separator and where to place it
                in each corresponding chunk `(True='start')`
            add_start_index: If `True`, includes chunk's start index in metadata
            strip_whitespace: If `True`, strips whitespace from the start and end of
                every document

        Raises:
            ValueError: If `chunk_size` is less than or equal to 0
            ValueError: If `chunk_overlap` is less than 0
            ValueError: If `chunk_overlap` is greater than `chunk_size`
        r   zchunk_size must be > 0, got z chunk_overlap must be >= 0, got zGot a larger chunk overlap (z) than chunk size (z), should be smaller.N)
ValueError_chunk_size_chunk_overlap_length_function_keep_separator_add_start_index_strip_whitespace)selfr   r   r   r    r"   r$   msgs           C:\Users\Dell Inspiron 16\Desktop\tws\AgrotaPowerBi\back-agrota-powerbi\mcp-client-agrota\venv\Lib\site-packages\langchain_text_splitters/base.py__init__zTextSplitter.__init__/   s    4 ??===CS//!1D]DDCS//!:%%6} 6 66 6 6  S//!%+ /- /!1    textstr	list[str]c                    dS )zSplit text into multiple components.

        Args:
            text: The text to split.

        Returns:
            A list of text chunks.
        N )r/   r4   s     r1   
split_textzTextSplitter.split_text\   s      r3   Ntexts	metadataslist[dict[Any, Any]] | Nonelist[Document]c           	        |pi gt          |          z  }g }t          |          D ]\  }}d}d}|                     |          D ]}	t          j        ||                   }
| j        rE||z   | j        z
  }|                    |	t          d|                    }||
d<   t          |	          }t          |	|
          }|
                    |           |S )a,  Create a list of `Document` objects from a list of texts.

        Args:
            texts: A list of texts to be split and converted into documents.
            metadatas: Optional list of metadata to associate with each document.

        Returns:
            A list of `Document` objects.
        r   start_index)page_contentmetadata)len	enumerater9   copydeepcopyr-   r*   findmaxr   append)r/   r:   r;   
metadatas_	documentsir4   indexprevious_chunk_lenchunkrA   offsetnew_docs                r1   create_documentszTextSplitter.create_documentsg   s     32$U"3
	 '' 	* 	*GAtE!".. * *=A77( 4"%77$:MMF IIeSF^^<<E.3H]+),U&"III  ))))* r3   rJ   Iterable[Document]c                    g g }}|D ]6}|                     |j                   |                     |j                   7|                     ||          S )zSplit documents.

        Args:
            documents: The documents to split.

        Returns:
            A list of split documents.
        )r;   )rH   r@   rA   rQ   )r/   rJ   r:   r;   docs        r1   split_documentszTextSplitter.split_documents   sc     ry 	+ 	+CLL)***S\****$$Ui$@@@r3   docs	separator
str | Nonec                j    |                     |          }| j        r|                                }|pd S N)joinr.   strip)r/   rV   rW   r4   s       r1   
_join_docszTextSplitter._join_docs   s5    ~~d##! 	 ::<<D|tr3   splitsIterable[str]c                   |                      |          }g }g }d}|D ]}|                      |          }||z   t          |          dk    r|ndz   | j        k    r|| j        k    r!t                              d|| j                   t          |          dk    r|                     ||          }	|	|                    |	           || j        k    s,||z   t          |          dk    r|ndz   | j        k    r}|dk    rw||                      |d                   t          |          dk    r|ndz   z  }|dd          }|| j        k    K||z   t          |          dk    r|ndz   | j        k    r|dk    w|                    |           ||t          |          dk    r|ndz   z  }|                     ||          }	|	|                    |	           |S )Nr   zACreated a chunk of size %d, which is longer than the specified %d   )r+   rB   r)   loggerwarningr]   rH   r*   )
r/   r^   rW   separator_lenrV   current_doctotaldlen_rT   s
             r1   _merge_splitszTextSplitter._merge_splits   s>    --i88!# 	K 	KA((++D[1A1AA1E1E1M"# # 4+++NN'(	   {##a''//+yAACC(((  $"555[9I9IA9M9MSTU*+ +!AII!6!6{1~!F!F-0-=-=-A-AMMq"  '2!""o  $"555[9I9IA9M9MSTU*+ +!AII q!!!Tc+.>.>.B.B]]JJEEook955?KKr3   	tokenizerr   kwargsr	   c                    t           sd}t          |          t          t                    sd}t          |          d	fd} | d
d|i|S )a  Text splitter that uses Hugging Face tokenizer to count length.

        Args:
            tokenizer: The Hugging Face tokenizer to use.

        Returns:
            An instance of `TextSplitter` using the Hugging Face tokenizer for length
                calculation.
        z`Could not import transformers python package. Please install it with `pip install transformers`.zATokenizer received was not an instance of PreTrainedTokenizerBaser4   r5   r%   r   c                H    t                              |                     S rZ   )rB   tokenizer4   rj   s    r1   _huggingface_tokenizer_lengthzNTextSplitter.from_huggingface_tokenizer.<locals>._huggingface_tokenizer_length   s    y))$//000r3   r   r4   r5   r%   r   r8   )_HAS_TRANSFORMERSr(   
isinstancer   )clsrj   rk   r0   rp   s    `   r1   from_huggingface_tokenizerz'TextSplitter.from_huggingface_tokenizer   s     ! 	"E  S//!)%<== 	"UCS//!	1 	1 	1 	1 	1 	1 sKK#@KFKKKr3   gpt2allencoding_name
model_nameallowed_special!Literal['all'] | AbstractSet[str]disallowed_special Literal['all'] | Collection[str]r   c                   	 t           sd}t          |          |t          j        |          	nt          j        |          	d
	fd}t          | t                    r||d}i ||} | dd	|i|S )am  Text splitter that uses `tiktoken` encoder to count length.

        Args:
            encoding_name: The name of the tiktoken encoding to use.
            model_name: The name of the model to use.

                If provided, this will override the `encoding_name`.
            allowed_special: Special tokens that are allowed during encoding.
            disallowed_special: Special tokens that are disallowed during encoding.

        Returns:
            An instance of `TextSplitter` using tiktoken for length calculation.

        Raises:
            ImportError: If the tiktoken package is not installed.
        zCould not import tiktoken python package. This is needed in order to calculate max_tokens_for_prompt. Please install it with `pip install tiktoken`.Nr4   r5   r%   r   c                N    t                              |                     S N)rz   r|   )rB   encode)r4   rz   r|   encs    r1   _tiktoken_encoderz=TextSplitter.from_tiktoken_encoder.<locals>._tiktoken_encoder  s4    

$3'9     r3   )rx   ry   rz   r|   r   rq   r8   )_HAS_TIKTOKENImportErrortiktokenencoding_for_modelget_encoding
issubclassTokenTextSplitter)
rt   rx   ry   rz   r|   rk   r0   r   extra_kwargsr   s
      ``    @r1   from_tiktoken_encoderz"TextSplitter.from_tiktoken_encoder   s    2  	#A 
 c"""!-j99CC'66C	 	 	 	 	 	 	 	 c,-- 	0!.(#2&8	 L 0/,/Fs??#4????r3   Sequence[Document]c                F    |                      t          |                    S )zTransform sequence of documents by splitting them.

        Args:
            documents: The sequence of documents to split.

        Returns:
            A list of split documents.
        )rU   list)r/   rJ   rk   s      r1   transform_documentsz TextSplitter.transform_documents  s     ##DOO444r3   )r   r   r   r   r   r   r    r!   r"   r#   r$   r#   r%   r&   r4   r5   r%   r6   rZ   )r:   r6   r;   r<   r%   r=   )rJ   rR   r%   r=   )rV   r6   rW   r5   r%   rX   )r^   r_   rW   r5   r%   r6   )rj   r   rk   r	   r%   r   )rx   r5   ry   rX   rz   r{   r|   r}   rk   r	   r%   r   )rJ   r   rk   r	   r%   r   )__name__
__module____qualname____doc__rB   r2   r   r9   rQ   rU   r]   ri   classmethodru   setr   r   r   r8   r3   r1   r   r   ,   sL       33  039> %!%+2 +2 +2 +2 +2Z    ^ JN    8A A A A   * * * *X L L L [L8  $!%=@SUU?D7@ 7@ 7@ 7@ [7@r 5 5 5 X5 5 5r3   c                  F     e Zd ZdZdd e            dfd fdZddZ xZS )r   z/Splitting text to tokens using model tokenizer.rv   Nrw   rx   r5   ry   rX   rz   r{   r|   r}   rk   r	   r%   r&   c                     t                      j        di | t          sd}t          |          |t	          j        |          }nt	          j        |          }|| _        || _        || _	        dS )a  Create a new `TextSplitter`.

        Args:
            encoding_name: The name of the tiktoken encoding to use.
            model_name: The name of the model to use.

                If provided, this will override the `encoding_name`.
            allowed_special: Special tokens that are allowed during encoding.
            disallowed_special: Special tokens that are disallowed during encoding.

        Raises:
            ImportError: If the tiktoken package is not installed.
        zCould not import tiktoken python package. This is needed in order to for TokenTextSplitter. Please install it with `pip install tiktoken`.Nr8   )
superr2   r   r   r   r   r   
_tokenizer_allowed_special_disallowed_special)	r/   rx   ry   rz   r|   rk   r0   r   	__class__s	           r1   r2   zTokenTextSplitter.__init__-  s    * 	""6""" 	#A 
 c"""!-j99CC'66C /#5   r3   r4   r6   c                     d fd}t           j         j         j        j        |          }t          ||          S )	ar  Splits the input text into smaller chunks based on tokenization.

        This method uses a custom tokenizer configuration to encode the input text
        into tokens, processes the tokens in chunks of a specified size with overlap,
        and decodes them back into text chunks. The splitting is performed using the
        `split_text_on_tokens` function.

        Args:
            text: The input text to be split into smaller chunks.

        Returns:
            A list of text chunks, where each chunk is derived from a portion
                of the input text based on the tokenization and chunking rules.
        _textr5   r%   	list[int]c                R    j                             | j        j                  S r   )r   r   r   r   )r   r/   s    r1   _encodez-TokenTextSplitter.split_text.<locals>._encodec  s1    ?)) $ 5#'#; *   r3   )r   tokens_per_chunkdecoder   ro   )r   r5   r%   r   )	Tokenizerr*   r)   r   r   split_text_on_tokens)r/   r4   r   rj   s   `   r1   r9   zTokenTextSplitter.split_textS  sc     	 	 	 	 	 	 -!-?)	
 
 
	 $CCCCr3   )rx   r5   ry   rX   rz   r{   r|   r}   rk   r	   r%   r&   r   )r   r   r   r   r   r2   r9   __classcell__)r   s   @r1   r   r   *  sy        99 $!%=@SUU?D$6 $6 $6 $6 $6 $6 $6LD D D D D D D Dr3   r   c                      e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdS )Languagez"Enum of the programming languages.cppgojavakotlinjstsphpprotopythonrrstrubyrustscalaswiftmarkdownlatexhtmlsolcsharpcobolcluaperlhaskellelixir
powershellvisualbasic6N) r   r   r   r   CPPGOJAVAKOTLINJSr   PHPPROTOPYTHONRRSTRUBYRUSTSCALASWIFTMARKDOWNLATEXHTMLSOLCSHARPCOBOLCLUAPERLHASKELLELIXIR
POWERSHELLVISUALBASIC6r8   r3   r1   r   r   t  s        ,,
C	BDF	B	B
CEFA
CDDEEHED
CFEA
CDGFJ!LLLr3   r   )frozenc                  B    e Zd ZU dZded<   	 ded<   	 ded<   	 ded<   d	S )
r   zTokenizer data class.r   r   r   zCallable[[list[int]], str]r   zCallable[[str], list[int]]r   N)r   r   r   r   __annotations__r8   r3   r1   r   r     sQ         *,&&&&=&&&&==r3   r   r4   r5   rj   r%   r6   c                   g }|                     |           }d}|j        |j        k    rd}t          |          |t	          |          k     rt          ||j        z   t	          |                    }|||         }|sne|                    |          }|r|                    |           |t	          |          k    rn%||j        |j        z
  z  }|t	          |          k     |S )zSplit incoming text and return chunks using tokenizer.

    Args:
        text: The input text to be split.
        tokenizer: The tokenizer to use for splitting.

    Returns:
        A list of text chunks.
    r   z3tokens_per_chunk must be greater than chunk_overlap)r   r   r   r(   rB   minr   rH   )	r4   rj   r^   	input_ids	start_idxr0   cur_idx	chunk_idsdecodeds	            r1   r   r     s     F  &&II!Y%<<<Coo
c)nn
$
$i)"<<c)nnMMi/0	 	""9-- 	#MM'"""c)nn$$Y/)2III	 c)nn
$
$ Mr3   )r4   r5   rj   r   r%   r6   ).r   
__future__r   rD   loggingabcr   r   dataclassesr   enumr   typingr   r	   r
   r   langchain_core.documentsr   r   typing_extensionsr   r   collections.abcr   r   r   r   r   AbstractSetr   r   r   $transformers.tokenization_utils_baser   rr   	getLoggerr   rb   r   r   r   r5   r   r   r   r8   r3   r1   <module>r      s   # # " " " " " "   # # # # # # # # ! ! ! ! ! !                  G F F F F F F F , , , , , , , , 3HHHHHHHHHHHH222222OOOMM   MMMLLLLLL    
	8	$	$WT((({5 {5 {5 {5 {5*C {5 {5 {5|GD GD GD GD GD GD GD GDT" " " " "sD " " "B $> > > > > > > >      s$   A A'&A'+A4 4A>=A>