
    Ngc                    4   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
 d dlmZmZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZ d dlmZ  ej         e!          Z"dZ#dZ$dZ%dZ&g dZ'g dZ(dgZ)g dZ*e'e(e)e*dZ+ G d de,e          Z- G d de          Z. G d de          Z/ G d de          Z0 G d de          Z1 G d  d!e          Z2d;d%Z3d<d'Z4d=d)Z5d>d+Z6d?d,Z7	 d@dAd2Z8dBd4Z9dCd6Z:dDd8Z; G d9 d:e          Z<dS )E    )annotationsN)Enum)
HTTPStatus)AnyDictListOptionalTuple)Document)get_runtime_environment)get_from_dict_or_env)	BaseModel)Responserequest)RequestException)
BaseLoaderz0.1.1zhttp://localhost:8000zhttps://api.daxa.ai  )
JSONLoaderS3FileLoaderUnstructuredMarkdownLoaderUnstructuredPDFLoaderUnstructuredFileLoaderUnstructuredJsonLoaderPyPDFLoaderGCSFileLoaderAmazonTextractPDFLoader	CSVLoaderUnstructuredExcelLoaderUnstructuredEmailLoader)DirectoryLoaderS3DirLoaderSlackDirectoryLoaderPyPDFDirectoryLoaderNotionDirectoryLoaderDataFrameLoader)NotionDBLoaderGoogleDriveLoaderSharePointLoader)filedir	in-memoryzcloud-folderc                      e Zd ZdZdZdZdS )Routesz2Routes available for the Pebblo API as enumerator.z/v1/loader/docz/v1/app/discoverN)__name__
__module____qualname____doc__
loader_docloader_app_discover     `/var/www/html/ai-engine/env/lib/python3.11/site-packages/langchain_community/utilities/pebblo.pyr-   r-   C   s!        <<!J,r5   r-   c                      e Zd ZU dZded<   dS )IndexedDocumentzPebblo Indexed Document.strpb_idNr.   r/   r0   r1   __annotations__r4   r5   r6   r8   r8   J   s!         ""JJJ$$r5   r8   c                      e Zd ZU dZdZded<   	 ded<   	 ded<   	 dZded	<   	 ded
<   	 ded<   	 ded<   	 ded<   	 ded<   	 dZded<   dS )RuntimezPebblo Runtime.localr9   typehostpath Optional[str]ipplatformos
os_versionlanguagelanguage_versionruntimeN)r.   r/   r0   r1   r@   r<   rE   rK   r4   r5   r6   r>   r>   Q   s         D,III#III)B4MMM*GGGOOOMMM(G44r5   r>   c                  *    e Zd ZU dZded<   	 ded<   dS )	FrameworkzPebblo Framework instance.r9   nameversionNr;   r4   r5   r6   rM   rM   j   s-         $$III LLL##r5   rM   c                  r    e Zd ZU dZded<   	 ded<   	 ded<   	 ded<   	 ded	<   	 d
ed<   	 ded<   	 d
ed<   dS )AppzPebblo AI application.r9   rN   ownerrD   descriptionload_idr>   rK   rM   	frameworkplugin_versionclient_versionNr;   r4   r5   r6   rQ   rQ   s   s           IIIJJJ!LLL-%'***r5   rQ   c                      e Zd ZU dZded<   	 ded<   	 ded<   	 ded<   	 ded<   	 d	ed
<   	 ded<   	 ded<   	 ded<   	 ded<   dS )DoczPebblo document.r9   rN   rR   listdocsrV   rT   dictloader_detailsboolloading_endsource_ownerclassifier_locationanonymize_snippetsNr;   r4   r5   r6   rY   rY      s         III0JJJJJJ.LLL-+7,%UUr5   rY   rB   r9   returnc                    | rd| v sd| d         k    s| dv r| S t          j        |           }|                                r|                                }t	          |          S )zReturn an absolute local path for a local file/directory,
    for a network related path, return as is.

    Args:
        path (str): Relative path to be resolved.

    Returns:
        str: Resolved absolute path.
    z:///r   )unknown-r+   )pathlibPathexistsresolver9   )rB   	full_paths     r6   get_full_pathrm      ss     TMM47NN111T""I (%%''	y>>r5   loaderc                T    t                                           D ]\  }}| |v r|c S dS )zReturn loader type among, file, dir or in-memory.

    Args:
        loader (str): Name of the loader, whose type is to be resolved.

    Returns:
        str: One of the loader type among, file/dir/in-memory.
    unsupported)LOADER_TYPE_MAPPINGitems)rn   loader_typeloaderss      r6   get_loader_typeru      sE     !4 9 9 ; ;  WW =r5   r   c                   ddl m}m}m}m} d}t          | t                    st                              d           |S | j	        }	 d|v rHt          | |          rd| j
         d| j         }nt          | |          rd| j
         d| j         }nd	|v r!|d	         }|rd
|v r|d
         }|r| d| }n_d|v r
|d         }nQd|v r
|d         }nCd|v r<|d         }|r0t          |t                    rt          |          dk    r|d         }nt          | |          rd}nt          | |          rd| j         }n| j        j        dk    r|                    d          r|                    d          }	d|	 }n|                    d          r6|                    dg           }
d                    d |
D                       }nJ|                    d          r5|                    dg           }d                    d |D                       }n# t(          $ r Y nw xY wt+          t-          |                    S )zReturn an absolute source path of source of loader based on the
    keys present in Document.

    Args:
        loader (BaseLoader): Langchain document loader, derived from Baseloader.
    r   )r%   r   r&   r   rg   zGloader is not derived from BaseLoader, source location will be unknown!bucketzgc://re   zs3://sourcechannelrB   	file_path	web_pathsr+   znotiondb://r'   	folder_idz+https://drive.google.com/drive/u/2/folders/file_idsz, c                    g | ]}d | d	S )z https://drive.google.com/file/d/z/viewr4   ).0file_ids     r6   
<listcomp>z(get_loader_full_path.<locals>.<listcomp>   s4       # J7III  r5   document_idsc                    g | ]}d | d	S )z#https://docs.google.com/document/d/z/editr4   )r   doc_ids     r6   r   z(get_loader_full_path.<locals>.<listcomp>  s4       " LfKKK  r5   )$langchain_community.document_loadersr%   r   r&   r   
isinstancer   loggererror__dict__rw   blobkeyrZ   lendatabase_id	__class__r.   getjoin	Exceptionrm   r9   )rn   r%   r   r&   r   locationloader_dictry   r{   r|   r}   r   s               r6   get_loader_full_pathr      s               Hfj)) U	
 	
 	
 /K/{""&-00 @@6=@@6;@@FL11 @?6=??6:??$$"8,H 7I44%i0 7"*66W66H{"""6*HHK''";/HHK''#K0I (Z	488 (S^^a=O=O$Q<00 	"HH// 	9V%799HH&*==={++ 'OOK88	TTT,, &??:r::99 '/    00 *~rBB99 &2       X'''s   GH 
H&%H&Tuple[Framework, Runtime]c                 4   t                      } t          d|                     dd                    }t          j                    }t          |j        t          j        d         |                     dd          |j	        |j
        t                      |                     dd          |                     d	d          
          }d|j        v rd|_        d|_        t                              d|            t                              d|            ||fS )zFetch the current Framework and Runtime details.

    Returns:
        Tuple[Framework, Runtime]: Framework and Runtime for the current app instance.
    	langchainlibrary_versionN)rN   rO   PWDrF   rf   rK   runtime_version)rA   rB   rF   rG   rH   rE   rI   rJ   DarwindesktopzMac OSXz
framework zruntime )r   rM   r   rF   unamer>   noderG   environsystemrO   get_ipr@   rK   r   debug)runtime_envrU   r   rK   s       r6   get_runtimer     s    *++K+//2CT"J"J  I NEZZY77<=88I66$):IFF	 	 	G 7: #
LL)i))***
LL%G%%&&&gr5   c                     ddl } |                                 }	 |                     |          }n%# t          $ r |                     d          }Y nw xY w|S )zJFetch local runtime ip address.

    Returns:
        str: IP address
    r   N	localhost)socketgethostnamegethostbynamer   )r   rA   	public_ips      r6   r   r   .  sp     MMMD6((..		 6 6 6((55			6s   0 AAr[   List[Document]max_batch_sizeintList[List[Document]]c                J   g }g }d}| D ]}t          |j                            d                    }||k    r|                    |g           F||z   |k    r|                    |           g }d}|                    |           ||z  }|r|                    |           |S )a  
    Generate batches of documents based on page_content size.
    Args:
        docs: List of documents to be batched.
        max_batch_size: Maximum size of each batch in bytes. Defaults to 100*1024(100KB)
    Returns:
        List[List[Document]]: List of batches of documents
    r   utf-8)r   page_contentencodeappend)r[   r   batchescurrent_batchcurrent_batch_sizedocdoc_sizes          r6   generate_size_based_batchesr   >  s     %'G$&M + +C,33G<<==n$$NNC5!!!!!H,~==}--- "%&"   %%%(*  &}%%%Nr5   rz   c                    	 ddl }t          j        |           j        }|                    |          j        }n# t          $ r d}Y nw xY w|S )zFetch owner of local file path.

    Args:
        file_path (str): Local file path.

    Returns:
        str: Name of owner.
    r   Nrf   )pwdrG   statst_uidgetpwuidpw_namer   )rz   r   file_owner_uidfile_owner_names       r6   get_file_owner_from_pathr   f  sb    $


++2,,~66> $ $ $#$s   7: A	A	source_pathc                   | sdS d}t           j                            |           r t           j                            |           }nt           j                            |           rd}t          j        |           D ]l\  }}}|D ]c}t           j                            ||          }t           j                            |          s"|t           j                            |          z  }dm|}|S )zFetch size of source path. Source can be a directory or a file.

    Args:
        source_path (str): Local path of data source.

    Returns:
        int: Source size in bytes.
    r   )rG   rB   isfilegetsizeisdirwalkr   islink)r   size
total_sizedirpath_	filenamesffps           r6   get_source_sizer   y  s      qD	w~~k"" 	w{++	{	#	# 
%'W[%9%9 	6 	6!GQ	 6 6W\\'1--w~~b)) 6"'//""5"55J6 Kr5   datac                N    |                      d          }t          |          }|S )zCalculate the content size in bytes:
    - Encode the string to bytes using a specific encoding (e.g., UTF-8)
    - Get the length of the encoded bytes.

    Args:
        data (str): Data string.

    Returns:
        int: Size of string in bytes.
    r   )r   r   )r   encoded_contentr   s      r6   calculate_content_sizer     s'     kk'**ODKr5   c                       e Zd ZU dZded<   	 dZded<   	 ded<   	 ded<   	 d	Zd
ed<   	 d2 fdZd3dZ	 d4d5dZ	d6dZ
d4d7dZd8d#Ze	 	 d9d:d,            Zed;d/            Zed<d1            Z xZS )=PebbloLoaderAPIWrapperzWrapper for Pebblo Loader API.rD   api_keyr?   r9   ra   classifier_url	cloud_urlFr^   rb   kwargsr   c                    t          |ddd          |d<   t          |ddt                    |d<   t          |ddt                    |d<    t                      j        d	i | dS )
z%Validate that api key in environment.r   PEBBLO_API_KEYrC   r   PEBBLO_CLASSIFIER_URLr   PEBBLO_CLOUD_URLNr4   )r   _DEFAULT_CLASSIFIER_URL_DEFAULT_PEBBLO_CLOUD_URLsuper__init__)selfr   r   s     r6   r   zPebbloLoaderAPIWrapper.__init__  s    0I/
 
y $8$&=?V$
 $
  3K!35N
 
{ 	""6"""""r5   apprQ   rc   Nonec                <   d}|                     d          }| j        dk    rF|                                 }| j         t          j        j         }|                     d|||          }| j        r|                     d          }|rCt          j
        |j                                      d          }|                    d|i           |                    dt          i           | j         t          j        j         }|                     d|||          }dS dS )	z
        Send app discovery request to Pebblo server & cloud.

        Args:
            app (App): App instance to be discovered.
        NTexclude_unsetr?   POSTcloud_requestpebblo_server_versionpebblo_client_version)r\   ra   _make_headersr   r-   r3   valuemake_requestr   jsonloadstextr   updatePLUGIN_VERSIONr   )	r   r   pebblo_resppayloadheadersapp_discover_urlr   pebblo_cloud_urlr   s	            r6   send_loader_discoverz+PebbloLoaderAPIWrapper.send_loader_discover  s9    (((..#w..((**G&J(B(HJJ  ++F4DgwWWK< 	N((t(<<G Q(,
;3C(D(D(H(H+) )%  79NOPPPNN3^DEEE"&.T&2L2RTT!!&*:GWMMAAA	N 	Nr5   docs_with_idList[IndexedDocument]r]   r\   r_   c                   |                     dd          }t          |          }|                     |||          \  }}|                     ||||||          }	i }
| j        dk    r|                                 }| j         t          j        j	         }	 | 
                    d|||	d          }|rMt          j        |j                                       dg           D ]}|
                    |d         |i            n2# t          $ r%}t                               d|           Y d	}~nd	}~ww xY w| j        rS| j        dk    r|                     |	d         |
           |	                    d
d	           |                     |	           n4| j        dk    r)t                               d           t-          d          |
S )a  
        Send documents to Pebblo server for classification.
        Then send classified documents to Daxa cloud(If api_key is present).

        Args:
            docs_with_id (List[IndexedDocument]): List of documents to be classified.
            app (App): App instance.
            loader_details (dict): Loader details.
            loading_end (bool): Boolean, indicating the halt of data loading by loader.
        r   rC   r?   r   i,  r[   r:   z3An Exception caught in classify_documents: local %sNrb   zpebblo-cloudz4API key is missing for sending docs to Pebblo cloud.)r   r   prepare_docs_for_classificationbuild_classification_payloadra   r   r   r-   r2   r   r   r   r   r   r   r   r   warningr   update_doc_datapopsend_docs_to_pebblo_cloud	NameError)r   r   r   r]   r_   r   r`   r[   source_aggregate_sizer   classified_docsr   load_doc_urlr   classified_doces                   r6   classify_documentsz)PebbloLoaderAPIWrapper.classify_documents  s   " %((;;/<<&*&J&J+~'
 '
## 33~|5JK
 
 #w..((**G"1L63D3JLLLY"//L'7C   *.*[5E*F*F*J*J6SU*V*V  '..+G4nE     Y Y YTVWXXXXXXXXY < 	T'722 $$WV_oFFFKK,d333**73333%77NNQRRRRSSSs   A(C? ?
D.	D))D.r   c                    |                      d          }| j         t          j        j         }	 |                     d|||          }dS # t          $ r&}t                              d|           Y d}~dS d}~ww xY w)z
        Send documents to Pebblo cloud.

        Args:
            payload (dict): The payload containing documents to be sent.
        Tr   r   z3An Exception caught in classify_documents: cloud %sN)	r   r   r-   r2   r   r   r   r   r  )r   r   r   r   r   r  s         r6   r  z0PebbloLoaderAPIWrapper.send_docs_to_pebblo_cloud  s     $$4$88"nGf.?.EGG	U!!&*:GWMMAAA 	U 	U 	UNNPRSTTTTTTTTT	Us   A 
A<A77A<r   c                    ddd}|r>| j         r|                    d| j         i           nt                              d           |S )z
        Generate headers for the request.

        args:
            cloud_request (bool): flag indicating whether the request is for Pebblo
            cloud.
        returns:
            dict: Headers for the request.

        zapplication/json)AcceptzContent-Typez	x-api-keyz,API key is missing for Pebblo cloud request.)r   r   r   r  )r   r   r   s      r6   r   z$PebbloLoaderAPIWrapper._make_headers(  s`     ).
 
  	O| OT\:;;;;MNNNr5   r[   
List[dict]r`   r  r   c                    |j         |j        |t          |j        |d|| j        | j        d
}|du rd|d<   d|v r||d         d<   t          d	i |                    d          }|S )
a  
        Build the payload for document classification.

        Args:
            app (App): App instance.
            docs (List[dict]): List of documents to be classified.
            loader_details (dict): Loader details.
            source_owner (str): Owner of the source.
            source_aggregate_size (int): Aggregate size of the source.
            loading_end (bool): Boolean indicating the halt of data loading by loader.

        Returns:
            dict: Payload for document classification.
        false)
rN   rR   r[   rV   rT   r]   r_   r`   ra   rb   Ttruer_   r]   r  r   r4   )rN   rR   r   rT   ra   rb   rY   r\   )r   r   r[   r]   r`   r  r_   r   s           r6   r  z3PebbloLoaderAPIWrapper.build_classification_payload?  s    0 HY,{,"(#'#;"&"9#
 #
 $%+GM"7**) ()*AB ....%%D%99r5   N   methodurlr   Optional[dict]timeoutOptional[Response]c           
     
   	 t          | ||||          }t                              d| |j         j        t	          t          |j         j        r|j         j        ng                     t	          |j                             |j        t          j	        k    r#t          
                    d|j                    no|j        t          j        k    r#t          
                    d|j                    n7|j        t          j        k    r"t          
                    d|j                    |S # t          $ r t          
                    d|           Y n1t          $ r%}t          
                    d|           Y d}~nd}~ww xY wdS )	a  
        Make a request to the Pebblo API

        Args:
            method (str): HTTP method (GET, POST, PUT, DELETE, etc.).
            url (str): URL for the request.
            headers (dict): Headers for the request.
            payload (Optional[dict]): Payload for the request (for POST, PUT, etc.).
            timeout (int): Timeout for the request in seconds.

        Returns:
            Optional[Response]: Response object if the request is successful.
        )r  r  r   r   r  z5Request: method %s, url %s, len %s response status %szPebblo Server: Error z$Pebblo received an invalid payload: z-Pebblo returned an unexpected response code: zUnable to reach server %sz'An Exception caught in make_request: %sN)r   r   r   r  r9   r   bodystatus_coder   INTERNAL_SERVER_ERRORr  BAD_REQUESTr   OKr   r   )r  r  r   r   r  responser  s          r6   r   z#PebbloLoaderAPIWrapper.make_requestk  s   *	I3gw  H LLG $C1A1FN(--BOOPPH())   #z'GGGMx7KMMNNNN%)???UhmUUVVVV%66.+. .  
 O 	= 	= 	=NN6<<<<< 	I 	I 	INNDaHHHHHHHH	Its   D(D+ +%F 	F E;;F r   Tuple[List[dict], int]c           
        g }d}d | D             }d}|D ]}|                     di           }|                     dg           }	|d         dk    r*t          |                     d|d	                             }
n7t          |                     d
|                     d|                              }
|                     dt          |
                    }|                     dt          |
                    }t	          |                     d                    }t          |          }||z  }|                     dd          pd}|                    ||
||                     di                                d          |d|	rd|	ini |d|ini            |d         dk    r|s|                     d          |d	<   d}||fS )a  
        Prepare documents for classification.

        Args:
            docs_with_id (List[IndexedDocument]): List of documents to be classified.
            source_path (str): Source path of the documents.
            loader_details (dict): Contains loader info.

        Returns:
            Tuple[List[dict], int]: Documents and the aggregate size
            of the source.
        r   c                6    g | ]}|                                 S r4   )r\   )r   r   s     r6   r   zJPebbloLoaderAPIWrapper.prepare_docs_for_classification.<locals>.<listcomp>  s     :::csxxzz:::r5   Fmetadataauthorized_identitiesrn   r(   rx   r   rl   rR   r   r   r:   Nlast_modified)r   r   r:   r&  
file_ownersource_path_sizesource_full_urlT)r   rm   r   r   r9   r   r   )r   r   r]   r[   r  doc_contentsource_path_updater   doc_metadatadoc_authorized_identitiesdoc_source_pathdoc_source_ownerdoc_source_sizer   page_content_sizer   s                   r6   r   z6PebbloLoaderAPIWrapper.prepare_docs_for_classification  s5   $  !::\:::" .	* .	*C77:r22L(4(8(89PRT(U(U%h'+==="/ $$X~m/LMM# # #0 $$#$((;?? # #  ,//1/BB    +..v7W7WXXOsww~6677L 6| D D!%66!WWWd++0qFKK'#2#%(WWZ%<%<%@%@%Q%Q"2  5 02KLL +6 ,_==  ( x(,>>>* ? 1=0@0@AR0S0S}-%)"***r5   r  c           
     B   | D ]}|                     |d         i           }|                    |                     d          |                     d          |                     di           |                     di           d           |                    d           dS )	z
        Update the document data with classified information.

        Args:
            docs (List[dict]): List of document data to be updated.
            classified_docs (dict): The dictionary containing classified documents.
        r:   pb_checksumloader_source_pathentitiestopics)r3  r4  r5  r6  r   N)r   r   r  )r[   r  doc_dataclassified_datas       r6   r  z&PebbloLoaderAPIWrapper.update_doc_data  s      	  	 H-11(72CRHHOOO#2#6#6}#E#E*9*=*=>R*S*S / 3 3J C C-11(B??	    LL	  	 r5   )r   r   )r   rQ   rc   r   )F)
r   r   r   rQ   r]   r\   r_   r^   rc   r\   )r   r\   rc   r   )r   r^   rc   r\   )r   rQ   r[   r  r]   r\   r`   r9   r  r   r_   r^   rc   r\   )Nr  )r  r9   r  r9   r   r\   r   r  r  r   rc   r  )r   r   r   r9   r]   r\   rc   r!  )r[   r  r  r\   rc   r   )r.   r/   r0   r1   r<   ra   rb   r   r   r  r  r   r  staticmethodr   r   r  __classcell__)r   s   @r6   r   r     s        (("&&&&&I!!!!&!$$$$$U# # # # # #N N N NH "< < < < <|U U U U    .* * * *X 
 #'/ / / / \/b D+ D+ D+ \D+L       \         r5   r   )rB   r9   rc   r9   )rn   r9   rc   r9   )rn   r   rc   r9   )rc   r   )rc   r9   )r   )r[   r   r   r   rc   r   )rz   r9   rc   r9   )r   r9   rc   r   )r   r9   rc   r   )=
__future__r   r   loggingrG   rh   rF   enumr   httpr   typingr   r   r   r	   r
   langchain_core.documentsr   langchain_core.envr   langchain_core.utilsr   pydanticr   requestsr   r   requests.exceptionsr   )langchain_community.document_loaders.baser   	getLoggerr.   r   r   r   r   BATCH_SIZE_BYTESfile_loader
dir_loader	in_memorycloud_folderrq   r9   r-   r8   r>   rM   rQ   rY   rm   ru   r   r   r   r   r   r   r   r   r4   r5   r6   <module>rM     s   " " " " " "   				               3 3 3 3 3 3 3 3 3 3 3 3 3 3 - - - - - - 6 6 6 6 6 6 5 5 5 5 5 5       & & & & & & & & 0 0 0 0 0 0 @ @ @ @ @ @		8	$	$1 1      
 	    	  - - - - -S$ - - -% % % % %h % % %5 5 5 5 5i 5 5 52$ $ $ $ $	 $ $ $+ + + + +) + + +*V V V V V) V V V2   .   E( E( E( E(P   >   " 1;% % % % %P   &   2    W  W  W  W  W Y W  W  W  W  W r5   