
    ~
i1                         d dl Z d dlmZ d dlmZmZmZmZmZ d dl	m
Z
 d dlmZ  e j        e          Zerd dlZ G d de          ZdS )    N)Path)TYPE_CHECKINGIteratorOptionalSequenceUnion)Document)
BaseLoaderc                       e Zd ZdZ	 	 	 	 ddeeef         dee         deee	                  d	ee
         d
ee
         f
dZddZdddefdZdee         fdZdS )MWDumpLoadera  Load `MediaWiki` dump from an `XML` file.

    Example:
        .. code-block:: python

            from langchain_text_splitters import RecursiveCharacterTextSplitter
            from langchain_community.document_loaders import MWDumpLoader

            loader = MWDumpLoader(
                file_path="myWiki.xml",
                encoding="utf8"
            )
            docs = loader.load()
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000, chunk_overlap=0
            )
            texts = text_splitter.split_documents(docs)


    :param file_path: XML local file path
    :type file_path: str
    :param encoding: Charset encoding, defaults to "utf8"
    :type encoding: str, optional
    :param namespaces: The namespace of pages you want to parse.
        See https://www.mediawiki.org/wiki/Help:Namespaces#Localisation
        for a list of all common namespaces
    :type namespaces: List[int],optional
    :param skip_redirects: TR=rue to skip pages that redirect to other pages,
        False to keep them. False by default
    :type skip_redirects: bool, optional
    :param stop_on_error: False to skip over pages that cause parsing errors,
        True to stop. True by default
    :type stop_on_error: bool, optional
    utf8NFT	file_pathencoding
namespacesskip_redirectsstop_on_errorc                     t          |t                    r|nt          |          | _        || _        || _        || _        || _        d S )N)
isinstancestrr   r   r   r   r   )selfr   r   r   r   r   s         C:\Users\Dell Inspiron 16\Desktop\tws\AgrotaPowerBi\back-agrota-powerbi\mcp-client-agrota\venv\Lib\site-packages\langchain_community/document_loaders/mediawikidump.py__init__zMWDumpLoader.__init__3   sH     '1C&@&@Tc)nn $,*    return
mwxml.Dumpc                     	 dd l }n"# t          $ r}t          d          |d }~ww xY w|j                            t	          | j        | j                            S )Nr   zBUnable to import 'mwxml'. Please install with `pip install mwxml`.)r   )mwxmlImportErrorDump	from_fileopenr   r   )r   r   es      r   _load_dump_filezMWDumpLoader._load_dump_fileB   sr    	LLLL 	 	 	T 	
 z##D$-$P$P$PQQQ    
&!&pagez
mwxml.Pagec                     	 ddl }n"# t          $ r}t          d          |d}~ww xY w|D ]O}|                    |j                  }|                    ddd          }d|j        i}t          ||          c S dS )	zParse a single page.r   NzXUnable to import 'mwparserfromhell'. Please install with `pip install mwparserfromhell`.TF)	normalizecollapsekeep_template_paramssource)page_contentmetadata)mwparserfromhellr   parsetext
strip_codetitler	   )r   r%   r-   r"   revisioncoder/   r,   s           r   _load_single_page_from_dumpz(MWDumpLoader._load_single_page_from_dumpL   s    	##### 	 	 	3  	
  	B 	BH#))(-88D??E #  D !$*-HAAAAAA	B 	Br$   c              #   Z  K   |                                  }|j        D ]}| j        r|j        r| j        r|j        | j        vr'	 |                     |          V  @# t          $ r@}t          	                    d
                    |                     | j        r|Y d}~d}~ww xY wdS )zLazy load from a file path.zParsing error: {}N)r#   pagesr   redirectr   	namespacer4   	Exceptionloggererrorformatr   )r   dumpr%   r"   s       r   	lazy_loadzMWDumpLoader.lazy_load]   s      
 ##%%J 	 	D" t}  4>#H#H66t<<<<<<   077::;;;% GHHHH	 	s   A
B((6B##B()r   NFT)r   r   )__name__
__module____qualname____doc__r   r   r   r   r   intboolr   r#   r	   r4   r   r>    r   r   r   r      s        ! !L #).2).(,+ +d#+ 3-+ Xc]+	+
 !+  ~+ + + +R R R RB B B B B B"	(	     r   r   )loggingpathlibr   typingr   r   r   r   r   langchain_core.documentsr	   )langchain_community.document_loaders.baser
   	getLoggerr?   r:   r   r   rE   r   r   <module>rL      s           E E E E E E E E E E E E E E - - - - - - @ @ @ @ @ @		8	$	$ LLLa a a a a: a a a a ar   