
    NgQ,                         d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	m
Z
mZmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZmZmZmZmZmZmZ  ej        e          Z  G d d	e          Z! G d
 de          Z"dS )z:Pebblo's safe dataloader is a wrapper for document loaders    N)version)AnyDictIterableIteratorListOptional)Document)
BaseLoader)BATCH_SIZE_BYTESPLUGIN_VERSIONApp	FrameworkIndexedDocumentPebbloLoaderAPIWrappergenerate_size_based_batchesget_full_pathget_loader_full_pathget_loader_typeget_runtimeget_source_sizec                   F   e Zd ZU dZdZeed<   	 	 	 	 	 d dddded	ed
edede	e         dede	e         dedefdZ
dee         fdZd!dZdee         fdZed!d            ZdefdZdee         fdZdedee         fdZdee         fdZdededefdZdeddfdZdS )"PebbloSafeLoaderzkPebblo Safe Loader class is a wrapper around document loaders enabling the data
    to be scrutinized.
    F_discover_sent Nlocal)classifier_locationanonymize_snippetslangchain_loadernameownerdescriptionapi_keyload_semanticclassifier_urlr   r   c                   |rt          |t                    st          d          || _        t          t	          j                              | _        || _        t          j	        
                    d          p|| _        || _        || _        t          | j                  | _        g | _        g | _        t          t%          | j                                                d          d                             d          d         }
t)          |
          | _        t-          | j                  | _        t0          | _        |
| j        | j        d| j        dk    rdt          | j                  ini | _        |                                 | _        t;          ||||		          | _        | j                            | j                   d S )
NzMust specify a valid name.PEBBLO_LOAD_SEMANTIC.'r   )loadersource_pathsource_typesource_path_size)r#   r   r%   r   ) 
isinstancestr	NameErrorapp_nameuuiduuid4load_idr+   osenvirongetr$   r!   r"   r   r,   docsdocs_with_idtypesplitr   r-   r   r.   r   
batch_sizeloader_details_get_app_detailsappr   	pb_clientsend_loader_discover)selfr   r    r!   r"   r#   r$   r%   r   r   loader_names              g/var/www/html/ai-engine/env/lib/python3.11/site-packages/langchain_community/document_loaders/pebblo.py__init__zPebbloSafeLoader.__init__%   s     	::dC00 	:89994:<<((&Z^^,BCCT}
&/<<$&	35$t{++,,22377;AA#FFqI*;77 /0@ A A*!++	
 	
 (1,, $S)>%?%?@@	
 ((**/ 3)1	
 
 
 	++DH55555    returnc                 t    | j                                         | _        |                                  | j        S )zxLoad Documents.

        Returns:
            list: Documents fetched from load method of the wrapped `loader`.
        )r+   loadr9   classify_in_batches)rC   s    rE   rJ   zPebbloSafeLoader.loadV   s3     K$$&&	  """yrG   c                    t          | j        | j                  }g }t          |          }t	          |          D ]\  }}||dz
  k    }|| _        |                                 | _        | j                            | j        | j	        | j
        |          }|                     |           | j        r|                     |          }n|                                 }|                    |           || _        dS )z
        Classify documents in batches.
        This is to avoid API timeouts when sending large number of documents.
        Batches are generated based on the page_content size.
           )loading_endN)r   r9   r=   len	enumerate_index_docsr:   rA   classify_documentsr@   r>   _add_pebblo_specific_metadatar$   _add_semantic_to_docs_unindex_docsextend)	rC   batchesprocessed_docstotal_batchesibatchis_last_batchclassified_docsbatch_processed_docss	            rE   rK   z$PebbloSafeLoader.classify_in_batchesa   s    )DIt)
 )
 *,G!'** 	8 	8HAu"#}q'8"8MDI $ 0 0 2 2D"n??!#)	 @  O ..???! <'+'A'A/'R'R$$'+'9'9';';$!!"67777"			rG   c              #     K   	 | j                                         }nP# t          $ rC}| j         j        j         d}t
                              |           t          |          |d}~ww xY w	 	 t          |          }n# t          $ r g | _	        Y dS w xY wt          |f          | _	        |                                 | _        | j                            | j        | j        | j                  }|                     |           | j        r|                     |          | _	        n|                                 | _	        | j	        d         V  )zLoad documents in lazy fashion.

        Raises:
            NotImplementedError: raised when lazy_load id not implemented
            within wrapped loader.

        Yields:
            list: Documents from loader's lazy loading.
        z does not implement lazy_load()NTr   )r+   	lazy_loadNotImplementedError	__class____name__loggererrornextStopIterationr9   listrQ   r:   rA   rR   r@   r>   rS   r$   rT   rU   )rC   doc_iteratorexcerr_strdocclassified_docs         rE   r`   zPebbloSafeLoader.lazy_load   sf     	8;0022LL" 	8 	8 	8.7XXXGLL!!!%g..C7	8	<((    	 cVDI $ 0 0 2 2D!^>>!48T-@ N ..~>>>! 1 66~FF		 ..00	)A,!	s&    
A+>A&&A+0B   BBc                     d| _         d S )NT)r   )clss    rE   set_discover_sentz"PebbloSafeLoader.set_discover_sent   s    !rG   c                     t                      \  }}t          | j        | j        | j        | j        ||t          t          dt          d                              }|S )z\Fetch app details. Internal method.

        Returns:
            App: App details.
        langchain_community)r    r   )r    r!   r"   r5   runtime	frameworkplugin_versionclient_version)	r   r   r2   r!   r"   r5   r   r   r   )rC   rt   rs   r@   s       rE   r?   z!PebbloSafeLoader._get_app_details   sl     )]]	7*(L)$* 566  
 
 
 
rG   c                 B    d t          | j                  D             }|S )z
        Indexes the documents and returns a list of IndexedDocument objects.

        Returns:
            List[IndexedDocument]: A list of IndexedDocument objects with unique IDs.
        c           	      n    g | ]2\  }}t          dd t          |          i|                                3S )pb_id )r   r0   dict.0rZ   rl   s      rE   
<listcomp>z0PebbloSafeLoader._index_docs.<locals>.<listcomp>   sM     
 
 
3 77#a&&7CHHJJ77
 
 
rG   )rP   r9   )rC   r:   s     rE   rQ   zPebbloSafeLoader._index_docs   s2    
 
#DI..
 
 
 rG   r]   c                     d | j         D             }|                                D ]7}|                    d          }||v r|                     ||         |           8d |                                D             }|S )aF  
        Adds semantic metadata to the given list of documents.

        Args:
            classified_docs (Dict): A dictionary of dictionaries containing the
                classified documents with pb_id as key.

        Returns:
            List[Document]: A list of Document objects with added semantic metadata.
        c                 P    i | ]#}|j         t          |j        |j                   $S )page_contentmetadata)ry   r
   r   r   r}   rl   s     rE   
<dictcomp>z:PebbloSafeLoader._add_semantic_to_docs.<locals>.<dictcomp>   s>     
 
 
 IxS-=UUU
 
 
rG   ry   c                     g | ]}|S rz   rz   r   s     rE   r~   z:PebbloSafeLoader._add_semantic_to_docs.<locals>.<listcomp>   s    !G!G!G##!G!G!GrG   )r:   valuesr8   _add_semantic_to_doc)rC   r]   indexed_docsrm   doc_idsemantic_metadata_docss         rE   rT   z&PebbloSafeLoader._add_semantic_to_docs   s    
 
(
 
 

 .4466 	P 	PN#''00F%%)),v*>OOO!G!G1D1D1F1F!G!G!G%%rG   c                 B    d t          | j                  D             }|S )z
        Converts a list of IndexedDocument objects to a list of Document objects.

        Returns:
            List[Document]: A list of Document objects.
        c                 J    g | ] \  }}t          |j        |j                   !S r   )r
   r   r   r|   s      rE   r~   z2PebbloSafeLoader._unindex_docs.<locals>.<listcomp>   s>     
 
 
3 #"2S\JJJ
 
 
rG   )rP   r:   )rC   r9   s     rE   rU   zPebbloSafeLoader._unindex_docs   s2    
 
#D$566
 
 
 rG   rl   rm   c                     t          |                    di                                                     |j        d<   t          |                    di                                                     |j        d<   |S )a4  
        Adds semantic metadata to the given document in-place.

        Args:
            doc (Document): A Document object.
            classified_doc (dict): A dictionary containing the classified document.

        Returns:
            Document: The Document object with added semantic metadata.
        entitiespebblo_semantic_entitiestopicspebblo_semantic_topics)rh   r8   keysr   )rC   rl   rm   s      rE   r   z%PebbloSafeLoader._add_semantic_to_doc   sy     48z2..33554
 4
/0 26x,,11332
 2
-. 
rG   c           
         | j         D ]}|j        }| j        j        j        dk    r,t          |                    d| j                            |d<   n?t          |                    d|                    d| j                                      |d<   |                    |j        i                               dd          |d<   dS )z*Add Pebblo specific metadata to documents.SharePointLoadersource	full_pathpb_checksumN)	r:   r   r+   rb   rc   r   r8   r,   ry   )rC   r]   rl   doc_metadatas       rE   rS   z.PebbloSafeLoader._add_pebblo_specific_metadata  s    $ 	 	C<L{$-1CCC,9 $$Xt/?@@- -[)) -: $$#\%5%5h@P%Q%Q - -[)
 +:*=*=ci*L*L*P*Pt+ +L''	 	rG   )r   r   NFN)rH   N)rc   
__module____qualname____doc__r   bool__annotations__r   r0   r	   rF   r   r
   rJ   rK   r   r`   classmethodrp   r   r?   r   rQ   r   rT   rU   r{   r   rS   rz   rG   rE   r   r      s          !ND    !%#(,/6 $+#(/6 /6 /6$/6 /6 	/6
 /6 #/6 /6 !/6 !/6 !/6 /6 /6 /6b	d8n 	 	 	 	# # # #@ 8H-        D " " " ["#    ,T/2    &T &d8n & & & &2tH~     $ 8    &T d      rG   r   c                       e Zd ZdZddddddee         dee         deee                  deeee	f                  deeeee	f                           d	dfd
Z
d	ee         fdZd	ee         fdZdS )PebbloTextLoaderz
    Loader for text data.

    Since PebbloSafeLoader is a wrapper around document loaders, this loader is
    used to load text data directly into Documents.
    N)r   idsr   	metadatastextsr   r   r   r   rH   c                L    || _         || _        || _        || _        || _        dS )a  
        Args:
            texts: Iterable of text data.
            source: Source of the text data.
                Optional. Defaults to None.
            ids: List of unique identifiers for each text.
                Optional. Defaults to None.
            metadata: Metadata for all texts.
                Optional. Defaults to None.
            metadatas: List of metadata for each text.
                Optional. Defaults to None.
        N)r   r   r   r   r   )rC   r   r   r   r   r   s         rE   rF   zPebbloTextLoader.__init__  s+    * 
 "rG   c              #   l  K   t          | j                  D ]\  }}d}| j        pi }| j        rE|t	          | j                  k     r-| j        |         r |                    | j        |                    | j        r%|t	          | j                  k     r| j        |         }t          |||          V  dS )zi
        Lazy load text data into Documents.

        Returns:
            Iterator of Documents
        N)idr   r   )rP   r   r   r   rO   updater   r
   )rC   rZ   text_idr   s        rE   r`   zPebbloTextLoader.lazy_load9  s       !,, 	I 	IGAtC}*H~ 3!c$.&9&9"9"9dnQ>O"9q 1222x "ADH--hqkcxHHHHHHH	I 	IrG   c                 b    g }|                                  D ]}|                    |           |S )z`
        Load text data into Documents.

        Returns:
            List of Documents
        )r`   append)rC   	documentsrl   s      rE   rJ   zPebbloTextLoader.loadI  s?     	>>## 	" 	"CS!!!!rG   )rc   r   r   r   r   r0   r	   r   r   r   rF   r   r
   r`   rJ   rz   rG   rE   r   r     s          !%#'-148# # #}# 	#
 d3i # 4S>*# Dc3h01# 
# # # #6I8H- I I I I 
d8n 
 
 
 
 
 
rG   r   )#r   loggingr6   r3   importlib.metadatar   typingr   r   r   r   r   r	   langchain_core.documentsr
   )langchain_community.document_loaders.baser   $langchain_community.utilities.pebblor   r   r   r   r   r   r   r   r   r   r   r   	getLoggerrc   rd   r   r   rz   rG   rE   <module>r      s   @ @  				  & & & & & & @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ @ - - - - - - @ @ @ @ @ @                            
	8	$	$u u u u uz u u up= = = = =z = = = = =rG   