
    Ng(                     $   d dl Z d dlZd dlmZ d dlmZmZmZmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ  G d	 d
e          Zedk    rC eddd          Ze                                Z ed ee           dej                    dS dS )    N)Path)AnyListOptionalTuple)unquote)Document)DirectoryLoader)PyPDFLoader)WebBaseLoaderc                   \    e Zd ZdZ	 	 	 	 	 ddedededeeeef                  d	ee         d
edef fdZ	ddZ
dee         fdZdedefdZdedee         fdZdedee         fdZdee         ddfdZdee         fdZdedee         fdZdeddfdZdedefdZdedefdZ xZS )BlackboardLoadera'  Load a `Blackboard` course.

    This loader is not compatible with all Blackboard courses. It is only
    compatible with courses that use the new Blackboard interface.
    To use this loader, you must have the BbRouter cookie. You can get this
    cookie by logging into the course and then copying the value of the
    BbRouter cookie from the browser's developer tools.

    Example:
        .. code-block:: python

            from langchain_community.document_loaders import BlackboardLoader

            loader = BlackboardLoader(
                blackboard_course_url="https://blackboard.example.com/webapps/blackboard/execute/announcement?method=search&context=course_entry&course_id=_123456_1",
                bbrouter="expires:12345...",
            )
            documents = loader.load()

    TNFblackboard_course_urlbbrouterload_all_recursively
basic_authcookiescontinue_on_failureshow_progressc                    t                                          |||           	 |                    d          d         | _        n# t          $ r t	          d          w xY w||| j        _        |i }|                    d|i           | j        j                            |           || _	        | 
                                 dS )aY  Initialize with blackboard course url.

        The BbRouter cookie is required for most blackboard courses.

        Args:
            blackboard_course_url: Blackboard course url.
            bbrouter: BbRouter cookie.
            load_all_recursively: If True, load all documents recursively.
            basic_auth: Basic auth credentials.
            cookies: Cookies.
            continue_on_failure: whether to continue loading the sitemap if an error
                occurs loading a url, emitting a warning instead of raising an
                exception. Setting this to True makes the loader more robust, but also
                may result in missing data. Default: False
            show_progress: whether to show a progress bar while loading. Default: True

        Raises:
            ValueError: If blackboard course url is invalid.
        )	web_pathsr   r   z/webapps/blackboardr   zpInvalid blackboard course url. Please provide a url that starts with https://<blackboard_url>/webapps/blackboardNBbRouter)super__init__splitbase_url
IndexErrorsessionauthupdater   r   	check_bs4)	selfr   r   r   r   r   r   r   	__class__s	           k/var/www/html/ai-engine/env/lib/python3.11/site-packages/langchain_community/document_loaders/blackboard.pyr   zBlackboardLoader.__init__$   s    : 	, 3' 	 	
 	
 	
	1778MNNqQDMM 	 	 	>  	 ! *DL?G
H-...##G,,,$8!s    A A"returnc                 J    	 ddl }dS # t          $ r t          d          w xY w)z|Check if BeautifulSoup4 is installed.

        Raises:
            ImportError: If BeautifulSoup4 is not installed.
        r   NzeBeautifulSoup4 is required for BlackboardLoader. Please install it with `pip install beautifulsoup4`.)bs4ImportError)r"   r'   s     r$   r!   zBlackboardLoader.check_bs4Y   sG    	JJJJJ 	 	 	G  	s    "c                 p   | j         r|                                 }|                     |          | _        |                     |          }g }|D ]}| j        |z   }t          d|            |                     |          }t          j	        t                    5  |                    |                     |                     ddd           n# 1 swxY w Y   |S t          d| j                    |                                 }|                     |          | _        |                     |          S )zZLoad data into Document objects.

        Returns:
            List of Documents.
        zFetching documents from N)r   scrape_get_folder_pathfolder_path
_get_pathsr   print_scrape
contextlibsuppress
ValueErrorextend_get_documentsweb_path)r"   	soup_inforelative_paths	documentspathurls         r$   loadzBlackboardLoader.loadg   s}    $ 	2I#44Y??D!__Y77NI& E Emd*666777 LL--	(44 E E$$T%8%8%C%CDDDE E E E E E E E E E E E E E E<T]<<===I#44Y??D&&y111s   )CC	C	soupc                 &   |                     dddi          }|t          d          |j                                        }t	          |                              dd                              dd                              d	d                              d
d                              dd                              dd                              dd                              dd          }t          d          |z  }t          |          S )zGet the folder path to save the Documents in.

        Args:
            soup: BeautifulSoup4 soup object.

        Returns:
            Folder path.
        spanidcrumb_1NzNo course name found. _/:,?'!".)findr2   textstripr   replacer   str)r"   r<   course_namecourse_name_cleanr,   s        r$   r+   z!BlackboardLoader._get_folder_path   s     iiy(9::4555!&,,.. K  WS#WS#WS#WS#WS#WS#WS#WS# 	 3ii"33;    c                     |                      |          }|                     |           |                                 }|S )zFetch content from page and return Documents.

        Args:
            soup: BeautifulSoup4 soup object.

        Returns:
            List of documents.
        )_get_attachments_download_attachments_load_documents)r"   r<   attachmentsr8   s       r$   r4   zBlackboardLoader._get_documents   sB     ++D11"";///((**	rR   c                 T   ddl m}m} |                    dddi          }|t	          d          g }|                    dddi          D ][}|                    d	          D ]C}|                    d
          }|*|                    d          s|                    |           D\|S )zGet all attachments from a page.

        Args:
            soup: BeautifulSoup4 soup object.

        Returns:
            List of attachments.
        r   )BeautifulSoupTagulclasscontentListNzNo content list found.rW   ahref#)	r'   rY   rZ   rK   r2   find_allget
startswithappend)	r"   r<   rY   rZ   content_listrW   
attachmentlinkr_   s	            r$   rT   z!BlackboardLoader._get_attachments   s     	+******* yy'?@@5666&//w6NOO 	- 	-J"++C00 - -xx''#DOOC,@,@#&&t,,,	-
 rR   rW   c                     t          | j                                      dd           |D ]}|                     |           dS )z_Download all attachments.

        Args:
            attachments: List of attachments.
        T)parentsexist_okN)r   r,   mkdirdownload)r"   rW   rf   s      r$   rU   z&BlackboardLoader._download_attachments   sV     	T$$TD$AAA% 	& 	&JMM*%%%%	& 	&rR   c                 f    t          | j        dt                    }|                                }|S )z[Load all documents in the folder.

        Returns:
            List of documents.
        z*.pdf)r9   glob
loader_cls)r
   r,   r   r;   )r"   loaderr8   s      r$   rV   z BlackboardLoader._load_documents   s9     !!"
 
 
 KKMM	rR   c                    g }|                     dddi          }|t          d          |                    d          D ]C}|                    d          }|*|                    d          r|                    |           D|S )	z%Get all relative paths in the navbar.r[   r\   
courseMenuNzNo course menu found.r^   r_   rC   )rK   r2   ra   rb   rc   rd   )r"   r<   r7   course_menurg   r_   s         r$   r-   zBlackboardLoader._get_paths   s    iiw&=>>4555((-- 	, 	,D88F##DDOOC$8$8%%d+++rR   r9   c                 2   | j                             | j        |z   d          }|                     |j                  }t          t          | j                  |z  d          5 }|                    |j	                   ddd           dS # 1 swxY w Y   dS )zXDownload a file from an url.

        Args:
            path: Path to the file.
        T)allow_redirectswbN)
r   rb   r   parse_filenamer:   openr   r,   writecontent)r"   r9   responsefilenamefs        r$   rl   zBlackboardLoader.download   s     <##DMD$8$#OO&&x|44$t'((83T:: 	&aGGH$%%%	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	& 	&s   $BBBr:   c                 r    t          |          x}r|j        dk    r|j        S |                     |          S )zParse the filename from an url.

        Args:
            url: Url to parse the filename from.

        Returns:
            The filename.
        .pdf)r   suffixname_parse_filename_from_url)r"   r:   url_paths      r$   rw   zBlackboardLoader.parse_filename   s?     S		!H 	6x&'@'@= 00555rR   c                 6   t          j        d|          }|r|                    d          }nt          d|           d|vrt          d|           |                    d          d         dz   }t          |          }|                    dd          }|S )	zParse the filename from an url.

        Args:
            url: Url to parse the filename from.

        Returns:
            The filename.

        Raises:
            ValueError: If the filename could not be parsed.
        zfilename%2A%3DUTF-8%27%27(.+)   zCould not parse filename from r   zIncorrect file type: r   z%20rA   )researchgroupr2   r   r   rN   )r"   r:   filename_matchesr|   s       r$   r   z)BlackboardLoader._parse_filename_from_url  s     9%EsKK 	E'--a00HHCcCCDDD!!?X??@@@>>&))!,v58$$##E3//rR   )TNNFT)r%   N)__name__
__module____qualname____doc__rO   boolr   r   dictr   r!   r   r	   r;   r   r+   r4   rT   rU   rV   r-   rl   rw   r   __classcell__)r#   s   @r$   r   r      s%        2 &*04"&$)"3 3"3 3 #	3
 U38_-3 $3 "3 3 3 3 3 3 3j   2d8n 2 2 2 20 S  S        <3 4>    S T#Y    8
&c 
&t 
& 
& 
& 
&h    "
s 
tCy 
 
 
 
&S &T & & & &6# 6# 6 6 6 6C C        rR   r   __main__zhttps://<YOUR BLACKBOARD URL HERE>/webapps/blackboard/content/listContent.jsp?course_id=_<YOUR COURSE ID HERE>_1&content_id=_<YOUR CONTENT ID HERE>_1&mode=resetz<YOUR BBROUTER COOKIE HERE>T)r   zLoaded z pages of PDFs from )r0   r   pathlibr   typingr   r   r   r   urllib.parser   langchain_core.documentsr	   .langchain_community.document_loaders.directoryr
   (langchain_community.document_loaders.pdfr   -langchain_community.document_loaders.web_baser   r   r   rp   r;   r8   r.   lenr5    rR   r$   <module>r      sW       				       - - - - - - - - - - - -             - - - - - - J J J J J J @ @ @ @ @ @ G G G G G GT T T T T} T T Tn z	C 	&!  F I	E
ICC	NN
I
I
I
IJJJJJ rR   