
    ~
i=                         d dl Z d dlmZmZmZmZmZmZmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ  G d d	e          ZdS )
    N)AnyAsyncIteratorIteratorListOptionalSetUnion)urlparse)BeautifulSoup)Document)
BaseLoader)WebBaseLoaderc                      e Zd ZdZ	 	 	 	 	 d%ddddeded	ee         d
edededee         deee                  fdZdedefdZ		 d&de
e         dededefdZdeee
e         f         defdZdedefdZdede
e         fdZ	 d'dedee         dee         de
e         fdZ	 d'ded	edee         dee         de
e         f
dZdee         fd Zdee         fd!Z	 d'ded"ee         dee         fd#Zdede
e         fd$ZdS )(GitbookLoadera   Load `GitBook` data.

    1. load from either a single page, or
    2. load all (relative) paths in the sitemap, handling nested sitemap indexes.

    When `load_all_paths=True`, the loader parses XML sitemaps and requires the
    `lxml` package to be installed (`pip install lxml`).
    FNmainT)sitemap_urlallowed_domainsweb_pageload_all_pathsbase_urlcontent_selectorcontinue_on_failureshow_progressr   r   c                   |p|| _         | j                             d          r| j         dd         | _         || _        || _        || _        || _        || _        || _        | j        t          |          j	        }	|	r|	h| _        |r|p	| j          d| _
        n|| _
        |                     | j
                  st          d| j
         d| j                   dS )aT  Initialize with web page and whether to load all paths.

        Args:
            web_page: The web page to load or the starting point from where
                relative paths are discovered.
            load_all_paths: If set to True, all relative paths in the navbar
                are loaded instead of only `web_page`. Requires `lxml` package.
            base_url: If `load_all_paths` is True, the relative paths are
                appended to this base url. Defaults to `web_page`.
            content_selector: The CSS selector for the content to load.
                Defaults to "main".
            continue_on_failure: whether to continue loading the sitemap if an error
                occurs loading a url, emitting a warning instead of raising an
                exception. Setting this to True makes the loader more robust, but also
                may result in missing data. Default: False
            show_progress: whether to show a progress bar while loading. Default: True
            sitemap_url: Custom sitemap URL to use when load_all_paths is True.
                Defaults to "{base_url}/sitemap.xml".
            allowed_domains: Optional set of allowed domains to fetch from.
                If None (default), the loader will restrict crawling to the domain
                of the `web_page` URL to prevent potential SSRF vulnerabilities.
                Provide an explicit set (e.g., {"example.com", "docs.example.com"})
                to allow crawling across multiple domains. Use with caution in
                server environments where users might control the input URLs.
        /Nz/sitemap.xmlz
Domain in z% is not in the allowed domains list: )r   endswithr   r   r   r   r   r   r
   netloc	start_url_is_url_allowed
ValueError)
selfr   r   r   r   r   r   r   r   initial_domains
             C:\Users\Dell Inspiron 16\Desktop\tws\AgrotaPowerBi\back-agrota-powerbi\mcp-client-agrota\venv\Lib\site-packages\langchain_community/document_loaders/gitbook.py__init__zGitbookLoader.__init__   s   J !,H=!!#&& 	/ M#2#.DM , 0#6 *. '%h//6N 8(6'7$  	&(Jt},J,J,JDNN%DN ##DN33 	*T^ * *'* *  	 	    urlreturnc                     | j         dS 	 t          |          }|j        dvrdS |j        sdS |j        | j         v S # t          $ r Y dS w xY w)z0Check if a URL has an allowed scheme and domain.NF)httphttps)r   r
   schemer   	Exception)r"   r'   parseds      r$   r    zGitbookLoader._is_url_allowedY   sz    
 '5	c]]F }$555u = u=D$888 	 	 	55	s   < < < 
A
	A
URLurl_listurl_typec                     |                      |          r|                    |           dS t          j        d| d|            dS )aB  Safely add a URL to a list if it's from an allowed domain.

        Args:
            url_list: The list to add the URL to
            url: The URL to add
            url_type: Type of URL for warning message (e.g., "sitemap", "content")

        Returns:
            bool: True if URL was added, False if skipped
        TzSkipping disallowed z URL: F)r    appendwarningswarn)r"   r0   r'   r1   s       r$   _safe_add_urlzGitbookLoader._safe_add_urlp   sW     $$ 	OOC   4MFFFFFGGG5r&   url_or_urlsc                 :    t          || j        | j                  S )zCreate a new WebBaseLoader instance for the given URL(s).

        This ensures each operation gets its own isolated WebBaseLoader.
        )web_pathr   r   )r   r   r   )r"   r7   s     r$   _create_web_loaderz GitbookLoader._create_web_loader   s*    
   $ 8,
 
 
 	
r&   soupc                 0    |                     d          duS )z+Check if the soup contains a sitemap index.sitemapindexN)find)r"   r;   s     r$   _is_sitemap_indexzGitbookLoader._is_sitemap_index   s    yy((44r&   c                     |                     d          }g }|D ]<}|                    d          }|r#|j        r|                     ||j        d           =|S )z*Extract sitemap URLs from a sitemap index.sitemaploc)find_allr>   textr6   )r"   r;   sitemap_tagsurlsrA   rB   s         r$   _extract_sitemap_urlsz#GitbookLoader._extract_sitemap_urls   si    }}Y//# 	> 	>G,,u%%C >sx >""49===r&   processed_urls
web_loaderc                 R   ||                      | j                  }|                     |          r|                     |          }g }|D ]}||v rt	          j        d|            |                    |           	 |j        }|g|_        |                    d          }||_        | 	                    |||          }	|
                    |	           # t          $ r-}
| j        rt	          j        d| d|
            n Y d}
~
d}
~
ww xY w|S |                     |          S )aO  Process a sitemap, handling both direct content URLs and sitemap indexes.

        Args:
            soup: The BeautifulSoup object of the sitemap
            processed_urls: Set of already processed URLs to avoid cycles
            web_loader: WebBaseLoader instance to reuse for all requests,
                created if None
        Nz(Skipping already processed sitemap URL: lxml-xmlparserError processing sitemap : )r:   r   r?   rG   r4   r5   add	web_pathsscrape_process_sitemapextendr-   r   
_get_paths)r"   r;   rH   rI   sitemap_urlsall_content_urlsr   original_web_pathssitemap_soupcontent_urlses              r$   rS   zGitbookLoader._process_sitemap   s    00@@J !!$'' &	)55d;;L!+  .00MP;PP   "";///)3)=&,7=J( $.#4#4J#4#G#GL ,>J( $(#8#8$nj$ $L %++L9999    /  &T+&T&TQR&T&TUUUU VUUUU $# ??4(((s   ?AC
D"#D

Dc                   K   ||                      | j                  }|                     |          r|                     |          }g }fd|D             }|sg S |j        }||_        |                    |d           d{V }	||_        t          ||	          D ]\  }
}                    |
           	 |                     |||           d{V }|	                    |           O# t          $ r-}| j        rt          j        d|
 d|            n Y d}~d}~ww xY w|S |                     |          S )a^  Async version of _process_sitemap.

        Args:
            soup: The BeautifulSoup object of the sitemap
            base_url: The base URL for relative paths
            processed_urls: Set of already processed URLs to avoid cycles
            web_loader: WebBaseLoader instance to reuse for all requests,
                created if None
        Nc                     g | ]}|v|	S  r^   ).0r'   rH   s     r$   
<listcomp>z3GitbookLoader._aprocess_sitemap.<locals>.<listcomp>   s#    QQQs.7P7P7P7P7Pr&   rK   rL   rN   rO   )r:   r   r?   rG   rQ   ascrape_allziprP   _aprocess_sitemaprT   r-   r   r4   r5   rU   )r"   r;   r   rH   rI   rV   rW   new_urlsrX   soupsr   rY   rZ   r[   s      `          r$   rc   zGitbookLoader._aprocess_sitemap   s     " 00@@J !!$'' &	)55d;;L! RQQQ|QQQH 	 ",!5#+J  %00*0MMMMMMMME $6J -05-A-A  )\"";///
)-)?)?$h
* * $ $ $ $ $ $L %++L9999    /  &T+&T&TQR&T&TUUUU VUUUU $# ??4(((s   ;3C//
D&9#D!!D&c              #     K   | j         sS|                     | j                  }|                                }|                     || j                  }|r|V  dS dS |                     | j                  }|                    d          }t                      }|                     ||          }|s#| j        rt          j
        d| j                    g }|D ]}|                     ||d           |sdS |                     |          }	|	                    |          }
t          |
|          D ]!\  }}|                     ||          }|r|V  "dS )zDFetch text from one single GitBook page or recursively from sitemap.rK   rL   $No content URLs found in sitemap at contentN)r   r:   r   rR   _get_documentr   setrS   r   r4   r5   r6   
scrape_allrb   )r"   temp_loaderr;   doc	soup_inforH   relative_pathsrF   r'   content_loader
soup_infoss              r$   	lazy_loadzGitbookLoader.lazy_load  s     " &	11$-@@K%%''D$$T4=99C 					  11$.AAK#****==I (+uuN!229nMMN! Wd&8 WUT^UUVVV !D% 9 9""4i8888  "44T::N (22488J"%j$"7"7  	3((C88 III r&   c                (  K   | j         si|                     | j                  }|                    | j        g           d{V }|d         }|                     || j                  }|r|W V  dS dS |                     | j                  }|                    | j        gd           d{V }|d         }t                      }|                     || j        |           d{V }|s#| j	        rt          j        d| j                    g }|D ]}|                     ||d           |sdS |                     |          }	|	                    |           d{V }
t          |
|          D ]"\  }}|                     ||          }||W V  #dS )z/Asynchronously fetch text from GitBook page(s).Nr   rK   rL   rg   rh   )r   r:   r   ra   ri   r   rj   rc   r   r   r4   r5   r6   rb   )r"   rl   re   rn   rm   rH   ro   rF   r'   rp   rq   	maybe_docs               r$   
alazy_loadzGitbookLoader.alazy_load?  s     " *	$11$-@@K%114=/BBBBBBBBEaI$$Y>>C 						  11$.AAK%114>2B:1VVVVVVVVEaI (+uuN#'#9#94=.$ $      N " Wd&8 WUT^UUVVV !D% 9 9""4i8888  "44T::N  .99$????????J"%j$"7"7 $ $	3 ..y#>>	(#OOOO$ $r&   
custom_urlc                    |                     | j                  }|sdS |                    d                                          }|                     d          }|r|j        nd}|p| j        |d}t          ||          S )z,Fetch content from page and return Document.N
)	separatorh1 )sourcetitle)page_contentmetadata)r>   r   get_textstriprD   r   r   )r"   r;   rv   page_content_rawrh   title_if_existsr}   r   s           r$   ri   zGitbookLoader._get_documentm  s      99T%:;; 	4"++d+;;AACC*//55(7?$$R(9DMEJJWx@@@@r&   c                 |    g }|                     d          D ]#}|j        r|                    |j                   $|S )zFetch all URLs in the sitemap.rB   )rC   rD   r3   )r"   r;   rF   rB   s       r$   rU   zGitbookLoader._get_pathsz  sH    =='' 	& 	&Cx & CH%%%r&   )FNr   FT)r/   )N)__name__
__module____qualname____doc__strboolr   r   r%   r    r   r6   r	   r   r:   r   r?   rG   rS   rc   r   r   rr   r   ru   r   ri   rU   r^   r&   r$   r   r      s          %"& &$)"A &*.2A A AA A 3-	A
 A "A A c]A "#c(+A A A AF3 4    0 >C S	(+7:	   (	
eCcN.C 	
 	
 	
 	
 	
5m 5 5 5 5 5- DI     /3	9) 9)9) C9) ]+	9)
 
c9) 9) 9) 9)@ /3;) ;);) ;) C	;)
 ]+;) 
c;) ;) ;) ;)z(8H- ( ( ( (T,$-"9 ,$ ,$ ,$ ,$^ 6:A AA%-c]A	(	A A A As tCy      r&   r   )r4   typingr   r   r   r   r   r   r	   urllib.parser
   bs4r   langchain_core.documentsr   )langchain_community.document_loaders.baser   -langchain_community.document_loaders.web_baser   r   r^   r&   r$   <module>r      s     K K K K K K K K K K K K K K K K K K ! ! ! ! ! !       - - - - - - @ @ @ @ @ @ G G G G G Gv v v v vJ v v v v vr&   