
    j
iyK                        d Z ddlmZ ddlZddlmZmZ ddlmZ ddl	m
Z
 ddlmZ  G d d	e          Z G d
 d          Z G d de          Z G d de          Z G d d          ZdS )zMarkdown text splitters.    )annotationsN)Any	TypedDictDocument)Language)RecursiveCharacterTextSplitterc                  $     e Zd ZdZd fdZ xZS )MarkdownTextSplitterz=Attempts to split the text along Markdown-formatted headings.kwargsr   returnNonec                |    |                      t          j                  } t                      j        dd|i| dS )z$Initialize a `MarkdownTextSplitter`.
separatorsN )get_separators_for_languager   MARKDOWNsuper__init__)selfr   r   	__class__s      C:\Users\Dell Inspiron 16\Desktop\tws\AgrotaPowerBi\back-agrota-powerbi\mcp-client-agrota\venv\Lib\site-packages\langchain_text_splitters/markdown.pyr   zMarkdownTextSplitter.__init__   sA    55h6GHH
99J9&99999    )r   r   r   r   )__name__
__module____qualname____doc__r   __classcell__)r   s   @r   r   r      sC        GG: : : : : : : : : :r   r   c                  :    e Zd ZdZ	 	 	 dddZddZddZddZdS )MarkdownHeaderTextSplitterz4Splitting markdown files based on specified headers.FTNheaders_to_split_onlist[tuple[str, str]]return_each_lineboolstrip_headerscustom_header_patternsdict[str, int] | Noner   r   c                d    || _         t          |d d          | _        || _        |pi | _        dS )a  Create a new `MarkdownHeaderTextSplitter`.

        Args:
            headers_to_split_on: Headers we want to track
            return_each_line: Return each line w/ associated headers
            strip_headers: Strip split headers from the content of the chunk
            custom_header_patterns: Optional dict mapping header patterns to their
                levels.

                For example: `{"**": 1, "***": 2}` to treat `**Header**` as level 1 and
                `***Header***` as level 2 headers.
        c                ,    t          | d                   S )Nr   )len)splits    r   <lambda>z5MarkdownHeaderTextSplitter.__init__.<locals>.<lambda>2   s    3uQx== r   T)keyreverseN)r#   sortedr!   r%   r&   )r   r!   r#   r%   r&   s        r   r   z#MarkdownHeaderTextSplitter.__init__   sM    ( !1 $*%@%@$$
 $
 $
  +&<&B###r   linestrsepc           	     F   | j         vrdS t          j                  }d| d| d| d| d	}t          j        ||          }|rZ|                    d                                          }|r1t          fd|                    d	d
          D                       sdS dS )zCheck if line matches a custom header pattern.

        Args:
            line: The line to check
            sep: The separator pattern to match

        Returns:
            `True` if the line matches the custom pattern format
        F^z(?!z
)(.+?)(?<!)$   c              3      K   | ]}|v V  	d S Nr   ).0cr2   s     r   	<genexpr>z?MarkdownHeaderTextSplitter._is_custom_header.<locals>.<genexpr>T   s'      "N"N18"N"N"N"N"N"Nr     T)r&   reescapematchgroupstripallreplace)r   r0   r2   escaped_seppatternrA   contents     `    r   _is_custom_headerz,MarkdownHeaderTextSplitter._is_custom_header9   s     d1115 inn TSSSS{SS[SSS 	 $'' 	kk!nn**,,G  s"N"N"N"NW__S"5M5M"N"N"NNN tur   lineslist[LineType]list[Document]c                4   g }|D ]}|r8|d         d         |d         k    r |d         dxx         d|d         z   z  cc<   =|r|d         d         |d         k    rt          |d         d                   t          |d                   k     ri|d         d                             d          d         d         dk    r8| j        s1|d         dxx         d|d         z   z  cc<   |d         |d         d<   |                    |           	d |D             S )	zCombine lines with common metadata into chunks.

        Args:
            lines: Line of text / associated header metadata

        Returns:
            List of `Document` objects with common metadata aggregated.
        metadatarH   z  

r   #c                H    g | ]}t          |d          |d                    S rH   rO   page_contentrO   r   r:   chunks     r   
<listcomp>zHMarkdownHeaderTextSplitter.aggregate_lines_to_chunks.<locals>.<listcomp>   ?     
 
 
 %	"2U:=NOOO
 
 
r   )r*   r+   r%   append)r   rJ   aggregated_chunksr0   s       r   aggregate_lines_to_chunksz4MarkdownHeaderTextSplitter.aggregate_lines_to_chunksX   sn    -/ 	/ 	/D!/%b)*5j9III
 ""%i000FT)_4LL0000!/%b)*5j9III)"-j9::SjAQ=R=RRR%b))4::4@@DQG3NN* O ""%i000FT)_4LL000484D!"%j11 "((....
 
*
 
 
 	
r   textc                   |                     d          }g }g }i }g }i }d}d}	|D ]E}
|
                                }d                    t          t          j        |                    }|sM|                    d          r|                    d          dk    rd}d}	n3|                    d          rd}d}	n|                    |	          rd}d}	|r|                    |           | j	        D ]\  }}|                    |          o8t          |          t          |          k    p|t          |                   dk    }|                     ||          }|s|r||| j        v r| j        |         }n|                    d
          }|r_|d         d         |k    rM|                                }|d         |v r|                    |d                    |r|d         d         |k    M|r8|t          |          t          |                                                    }n)|t          |          d	                                         }|||d}|                    |           |d         ||<   |rQ|                    d                    |          |                                d           |                                 | j        s|                    |            nm|r|                    |           nS|rQ|                    d                    |          |                                d           |                                 |                                }G|r+|                    d                    |          |d           | j        s|                     |          S d |D             S )zSplit markdown file.

        Args:
            text: Markdown file

        Returns:
            List of `Document` objects.
        rP   Fr>   z```r7   Tz~~~r=   NrQ   rN   levelname)r_   r`   datara   )rH   rO   c                H    g | ]}t          |d          |d                    S rS   r   rV   s     r   rX   z9MarkdownHeaderTextSplitter.split_text.<locals>.<listcomp>  rY   r   )r+   rC   joinfilterr1   isprintable
startswithcountrZ   r!   r*   rI   r&   popcopyclearr%   r#   r\   )r   r]   rJ   lines_with_metadatacurrent_contentcurrent_metadataheader_stackinitial_metadatain_code_blockopening_fencer0   stripped_liner2   r`   is_standard_headeris_custom_headercurrent_header_levelpopped_headerheader_textheaders                       r   
split_textz%MarkdownHeaderTextSplitter.split_text   s    

4   /1 &(+- *,+- d	7 d	7D JJLLM GGF3?M$J$JKKM  
# ++E22 *}7J7J57Q7QUV7V7V$(M$)MM"--e44 *$(M$)M))-88 # % " &&}555 "5 L, L,	T%2%=%=c%B%B & &&#c((2TmCHH6MQT6T #
 $(#9#9-#M#M  & 8)9 8'$"===373Ns3S003699S>>0 )
L ,R 0 9=Q Q Q -9,<,<,>,>M  -V48HHH 0 4 4]65J K K K )
L ,R 0 9=Q Q Q , L +8CCHH98L*M*S*S*U*UKK +8C

*C*I*I*K*KK &:$($/. .
 %++F33317(. ' 0+22+/99_+E+E,<,A,A,C,C    (--///- >'..}===Eq8t ! 	,#**=9999$ ,'..'+yy'A'A(8(=(=(?(?    $))+++/4466 	&&#yy99 0    $ 	G112EFFF
 
,
 
 
 	
r   )FTN)
r!   r"   r#   r$   r%   r$   r&   r'   r   r   )r0   r1   r2   r1   r   r$   )rJ   rK   r   rL   r]   r1   r   rL   )r   r   r   r   r   rI   r\   ry   r   r   r   r    r       s        >>
 "'"8<C C C C C>   >,
 ,
 ,
 ,
\R
 R
 R
 R
 R
 R
r   r    c                  (    e Zd ZU dZded<   ded<   dS )LineTypezLine type as `TypedDict`.zdict[str, str]rO   r1   rH   Nr   r   r   r   __annotations__r   r   r   r|   r|     s+         ##LLLLLr   r|   c                  2    e Zd ZU dZded<   ded<   ded<   dS )
HeaderTypezHeader type as `TypedDict`.intr_   r1   r`   ra   Nr}   r   r   r   r   r   "  s1         %%JJJIIIIIIIIr   r   c                  z    e Zd ZdZ	 	 	 dd dZd!dZd"dZd#dZd$dZd%dZ	e
d%d            Ze
d%d            ZdS )&&ExperimentalMarkdownSyntaxTextSplittera  An experimental text splitter for handling Markdown syntax.

    This splitter aims to retain the exact whitespace of the original text while
    extracting structured metadata, such as headers. It is a re-implementation of the
    `MarkdownHeaderTextSplitter` with notable changes to the approach and additional
    features.

    Key Features:

    * Retains the original whitespace and formatting of the Markdown text.
    * Extracts headers, code blocks, and horizontal rules as metadata.
    * Splits out code blocks and includes the language in the "Code" metadata key.
    * Splits text on horizontal rules (`---`) as well.
    * Defaults to sensible splitting behavior, which can be overridden using the
        `headers_to_split_on` parameter.

    Example:
        ```python
        headers_to_split_on = [
            ("#", "Header 1"),
            ("##", "Header 2"),
        ]
        splitter = ExperimentalMarkdownSyntaxTextSplitter(
            headers_to_split_on=headers_to_split_on
        )
        chunks = splitter.split(text)
        for chunk in chunks:
            print(chunk)
        ```

    This class is currently experimental and subject to change based on feedback and
    further development.
    NFTr!   list[tuple[str, str]] | Noner#   r$   r%   r   r   c                    g | _         t          d          | _        g | _        || _        |rt          |          | _        nddddddd	| _        || _        d
S )a  Initialize the text splitter with header splitting and formatting options.

        This constructor sets up the required configuration for splitting text into
        chunks based on specified headers and formatting preferences.

        Args:
            headers_to_split_on: A list of tuples, where each tuple contains a header
                tag (e.g., "h1") and its corresponding metadata key.

                If `None`, default headers are used.
            return_each_line: Whether to return each line as an individual chunk.

                Defaults to `False`, which aggregates lines into larger chunks.
            strip_headers: Whether to exclude headers from the resulting chunks.
        r>   rU   zHeader 1zHeader 2zHeader 3zHeader 4zHeader 5zHeader 6)rQ   z##z###z####z#####z######N)chunksr   current_chunkcurrent_header_stackr%   dictsplittable_headersr#   )r   r!   r#   r%   s       r   r   z/ExperimentalMarkdownSyntaxTextSplitter.__init__M  s}    * ')%2666;=!* 
	&*+>&?&?D##   !"#$' 'D# !1r   r]   r1   rL   c                   | j                                          t          d          | _        | j                                         |                    d          }|rp|                    d          }|                     |          }|                     |          }| 	                    |          }|r~| 
                                 | j        s| j        xj        |z  c_        t          |                    d                    }|                    d          }|                     ||           n|rk| 
                                 |                     ||          | j        _        |                    d          | j        j        d<   | 
                                 n,|r| 
                                 n| j        xj        |z  c_        |p| 
                                 | j        rd	 | j         D             S | j         S )
a[  Split the input text into structured chunks.

        This method processes the input text line by line, identifying and handling
        specific patterns such as headers, code blocks, and horizontal rules to split it
        into structured chunks based on headers, code blocks, and horizontal rules.

        Args:
            text: The input text to be split into chunks.

        Returns:
            A list of `Document` objects representing the structured
            chunks of the input text. If `return_each_line` is enabled, each line
            is returned as a separate `Document`.
        r>   r   T)keependsr   r7      Codec                    g | ]J}|j                                         D ].}||                                t          ||j                   /KS )rT   )rU   
splitlinesisspacer   rO   )r:   rW   r0   s      r   rX   zEExperimentalMarkdownSyntaxTextSplitter.split_text.<locals>.<listcomp>  su       !.99;;  	 !%	dU^DDD   r   )r   rj   r   r   r   r   rh   _match_header_match_code_match_horz_complete_chunk_docr%   rU   r*   rB   _resolve_header_stack_resolve_code_chunkrO   r#   )	r   r]   	raw_linesraw_lineheader_match
code_match
horz_matchheader_depthrw   s	            r   ry   z1ExperimentalMarkdownSyntaxTextSplitter.split_textt  s'     	%2666!'')))OOTO22	 	< }}Q''H--h77L))(33J))(33J <((***) @&33x?33  #<#5#5a#8#899*0033**<EEEE 
<((***262J2Ji3 3"/ 7A6F6Fq6I6I"+F3((**** <((****"//8;//3  	<6 	  """   	 ![    {r   r   r   rw   c                    t          | j                  D ]$\  }\  }}||k    r| j        d |         | _         n%| j                            ||f           d S r9   )	enumerater   rZ   )r   r   rw   idepth_s         r   r   z<ExperimentalMarkdownSyntaxTextSplitter._resolve_header_stack  sq    &t'@AA 	 	MAzq$$,0,Ebqb,I) % 	!((,)DEEEEEr   current_liner   	list[str]c                t    |}|r3|                     d          }||z  }|                     |          r|S |3dS )Nr   r>   )rh   r   )r   r   r   rW   r   s        r   r   z:ExperimentalMarkdownSyntaxTextSplitter._resolve_code_chunk  sV     	 }}Q''HXE)) 	  	
 rr   c                $   | j         j        }|rl|                                sX| j        D ]1\  }}| j                            d|z            }|| j         j        |<   2| j                            | j                    t          d          | _         d S )NrQ   r>   r   )
r   rU   r   r   r   getrO   r   rZ   r   )r   chunk_contentr   value
header_keys        r   r   z:ExperimentalMarkdownSyntaxTextSplitter._complete_chunk_doc  s    *7 	3!6!6!8!8 	3 $ 9 @ @u!488uEE
:?"+J77Kt1222%2666r   r0   re.Match[str] | Nonec                p    t          j        d|          }|r|                    d          | j        v r|S d S )Nz^(#{1,6}) (.*)r7   )r?   rA   rB   r   )r   r0   rA   s      r   r   z4ExperimentalMarkdownSyntaxTextSplitter._match_header  s=    *D11 	U[[^^t'>>>Ltr   c                T      fddD             }t          d |D             d           S )Nc                :    g | ]}t          j        |          S r   r?   rA   r:   ruler0   s     r   rX   zFExperimentalMarkdownSyntaxTextSplitter._match_code.<locals>.<listcomp>  s%    OOOD28D$''OOOr   )z^```(.*)z^~~~(.*)c              3     K   | ]}||V  	d S r9   r   r:   rA   s     r   r<   zEExperimentalMarkdownSyntaxTextSplitter._match_code.<locals>.<genexpr>  '      99u59U999999r   nextr0   matchess   ` r   r   z2ExperimentalMarkdownSyntaxTextSplitter._match_code  s<    OOOO4NOOO999994@@@r   c                T      fddD             }t          d |D             d           S )Nc                :    g | ]}t          j        |          S r   r   r   s     r   rX   zFExperimentalMarkdownSyntaxTextSplitter._match_horz.<locals>.<listcomp>  s2     
 
 
%)BHT4  
 
 
r   )z
^\*\*\*+\nz^---+\nz^___+\nc              3     K   | ]}||V  	d S r9   r   r   s     r   r<   zEExperimentalMarkdownSyntaxTextSplitter._match_horz.<locals>.<genexpr>  r   r   r   r   s   ` r   r   z2ExperimentalMarkdownSyntaxTextSplitter._match_horz  sL    
 
 
 
-T
 
 
 999994@@@r   )NFT)r!   r   r#   r$   r%   r$   r   r   rz   )r   r   rw   r1   r   r   )r   r1   r   r   r   r1   )r   r   )r0   r1   r   r   )r   r   r   r   r   ry   r   r   r   r   staticmethodr   r   r   r   r   r   r   *  s           H =A!&"	%1 %1 %1 %1 %1N< < < <|F F F F   
7 
7 
7 
7    A A A \A A A A \A A Ar   r   )r   
__future__r   r?   typingr   r   langchain_core.documentsr   langchain_text_splitters.baser   "langchain_text_splitters.characterr	   r   r    r|   r   r   r   r   r   <module>r      sj     " " " " " " 				 ! ! ! ! ! ! ! ! - - - - - - 2 2 2 2 2 2 M M M M M M: : : : :9 : : :A
 A
 A
 A
 A
 A
 A
 A
H    y          wA wA wA wA wA wA wA wA wA wAr   