
    [ǻi>	                    v    d Z ddlmZ ddlmZ ddlmZ ddlmZ 	 ddl	Z	dZ
 G d	 d
e      Zy# e$ r dZ
Y w xY w)zNLTK text splitter.    )annotations)Any)override)TextSplitterNTFc                  T     e Zd ZdZ	 	 ddd	 	 	 	 	 	 	 	 	 d fdZedd       Z xZS )	NLTKTextSplitterz"Splitting text using NLTK package.F)use_span_tokenizec               r   t        |   di | || _        || _        || _        | j                  r| j                  rd}t        |      t        sd}t        |      | j                  r/t        j                  j                  | j                        | _        yt        j                  j                  | _        y)a  Initialize the NLTK splitter.

        Args:
            separator: The separator to use when combining splits.
            language: The language to use.
            use_span_tokenize: Whether to use `span_tokenize` instead of
                `sent_tokenize`.

        Raises:
            ImportError: If NLTK is not installed.
            ValueError: If `use_span_tokenize` is `True` and separator is not `''`.
        z6When use_span_tokenize is True, separator should be ''zANLTK is not installed, please install it with `pip install nltk`.N )super__init__
_separator	_language_use_span_tokenize
ValueError	_HAS_NLTKImportErrornltktokenize_get_punkt_tokenizer
_tokenizersent_tokenize)self	separatorlanguager	   kwargsmsg	__class__s         [/opt/lhia/marcimex/agent/venv/lib/python3.12/site-packages/langchain_text_splitters/nltk.pyr   zNLTKTextSplitter.__init__   s    ( 	"6"#!"3""tJCS/!UCc"""""mm@@PDO"mm99DO    c                j   | j                   rot        | j                  j                  |            }g }t	        |      D ]:  \  }\  }}|dkD  r||dz
     d   }||| ||| z   }n||| }|j                  |       < n| j                  || j                        }| j                  || j                        S )Nr      )r   )	r   listr   span_tokenize	enumerateappendr   _merge_splitsr   )	r   textspanssplitsistartendprev_endsentences	            r   
split_textzNLTKTextSplitter.split_text9   s     ""66t<=EF#,U#3 (<E3q5$QU|AH#HU3d5oEH#E#Hh'( __TDNN_CF!!&$//::r    )z

english)
r   strr   r2   r	   boolr   r   returnNone)r(   r2   r4   z	list[str])__name__
__module____qualname____doc__r   r   r0   __classcell__)r   s   @r   r   r      sb    ,  !!:
 #(!:!: !:
  !: !: 
!:F ; ;r    r   )r9   
__future__r   typingr   typing_extensionsr   langchain_text_splitters.baser   r   r   r   r   r   r    r   <module>r?      sC     "  & 6I
5;| 5;	  Is   . 88