
    Ng<                         d dl Z d dlZd dlZd dlmZmZmZmZmZm	Z	 d dl
mZ d dlmZ  ej        e          Zerd dlmZ  G d de          ZdS )    N)TYPE_CHECKINGAnyIteratorListOptionalTuple)Document)
BaseLoader)SparkSessionc            	           e Zd ZdZ	 	 	 	 dded         dee         ded	efd
Zde	e
e
f         fdZdee         fdZdee         fdZdS )PySparkDataFrameLoaderzLoad `PySpark` DataFrames.Ntext皙?spark_sessionr   dfpage_content_columnfraction_of_memoryc                    	 ddl m}m} n# t          $ r t          d          w xY w|r|n|j                                        | _        t          ||          st          dt          |                     || _
        || _        || _        |                                 \  | _        | _        | j
        j                            t$                    | _        | j
        j        | _        dS )ag  Initialize with a Spark DataFrame object.

        Args:
            spark_session: The SparkSession object.
            df: The Spark DataFrame object.
            page_content_column: The name of the column containing the page content.
             Defaults to "text".
            fraction_of_memory: The fraction of memory to use. Defaults to 0.1.
        r   )	DataFramer   zFpyspark is not installed. Please install it with `pip install pyspark`z3Expected data_frame to be a PySpark DataFrame, got N)pyspark.sqlr   r   ImportErrorbuildergetOrCreatespark
isinstance
ValueErrortyper   r   r   get_num_rowsnum_rowsmax_num_rowsrddmaplistrdd_dfcolumnscolumn_names)selfr   r   r   r   r   r   s          r/var/www/html/ai-engine/env/lib/python3.11/site-packages/langchain_community/document_loaders/pyspark_dataframe.py__init__zPySparkDataFrameLoader.__init__   s    	;;;;;;;;; 	 	 	?  	 +RMM0D0P0P0R0R 	
 "i(( 	Pd2hhPP   #6 "4+/+<+<+>+>(t(gkood++ GOs    %returnc                    	 ddl }n"# t          $ r}t          d          |d}~ww xY w| j                            d                                          d         }t          j        |          }|                                }|j        }t          ||z  | j
        z            }t          || j                                                  |fS )z4Gets the number of "feasible" rows for the DataFramer   NzBpsutil not installed. Please install it with `pip install psutil`.   )psutilr   r   limitcollectsys	getsizeofvirtual_memory	availableintr   mincount)r'   r-   erowestimated_row_sizemem_infoavailable_memoryr    s           r(   r   z#PySparkDataFrameLoader.get_num_rows:   s    	MMMM 	 	 	T 	 gmmA&&((+ ]3//((**#- 22d6MM
 
 <11<??s    
&!&c              #      K    j                                         D ]e fdt          t                              D             }| j                 }|                     j                   t          ||          V  fdS )z#A lazy loader for document content.c                 8    i | ]}j         |         |         S  )r&   ).0ir8   r'   s     r(   
<dictcomp>z4PySparkDataFrameLoader.lazy_load.<locals>.<dictcomp>N   s'    NNN)!,c!fNNN    )page_contentmetadataN)r$   toLocalIteratorrangelenr   popr	   )r'   rD   r   r8   s   `  @r(   	lazy_loadz PySparkDataFrameLoader.lazy_loadK   s      ;..00 	A 	ACNNNNNeCHHooNNNHD45DLL1222x@@@@@@@		A 	ArB   c                 6   | j                                         | j        k    r=t                              d| j                                          d| j         d           |                                 }t          t          j	        || j                            S )zLoad from the dataframe.z The number of DataFrame rows is zQ, but we will only include the amount of rows that can reasonably fit in memory: .)
r   r6   r    loggerwarningr   rI   r#   	itertoolsislice)r'   lazy_load_iterators     r(   loadzPySparkDataFrameLoader.loadS   s    7==??T...NNO47==?? O O>BmO O O  
 "^^--I$%7GGHHHrB   )NNr   r   )__name__
__module____qualname____doc__r   r   strfloatr)   r   r4   r   r   r	   rI   r   rQ   r>   rB   r(   r   r      s        $$ 37 #)$'%, %,/%, SM%, !	%,
 "%, %, %, %,N@eCHo @ @ @ @"A8H- A A A A	Id8n 	I 	I 	I 	I 	I 	IrB   r   )rN   loggingr0   typingr   r   r   r   r   r   langchain_core.documentsr	   )langchain_community.document_loaders.baser
   	getLogger__file__rL   r   r   r   r>   rB   r(   <module>r^      s         



 F F F F F F F F F F F F F F F F - - - - - - @ @ @ @ @ @		8	$	$ )((((((LI LI LI LI LIZ LI LI LI LI LIrB   