
    NgQ                         d dl Z d dlZd dlZd dlmZmZ d dlmZmZm	Z	m
Z
mZ d dlmZ d dlmZ  G d dee          Z G d d	e          Z G d
 de          ZdS )    N)ABCabstractmethod)DictIteratorOptionalTupleUnion)Document)
BaseLoaderc            !          e Zd ZdZdddddddd	d	dddd
d
ddedededeeef         dededededededeeef         deeef         deeef         dee         dee         dd
f dZ	de
e         fdZedefd            Zdedefd Zd!ed"ede
e         fd#Z	 d(d!eded$ee         de
e         fd%Zd&edeeef         fd'Zd
S ))DedocBaseLoadera  
    Base Loader that uses `dedoc` (https://dedoc.readthedocs.io).

    Loader enables extracting text, tables and attached files from the given file:
        * `Text` can be split by pages, `dedoc` tree nodes, textual lines
            (according to the `split` parameter).
        * `Attached files` (when with_attachments=True)
            are split according to the `split` parameter.
            For attachments, langchain Document object has an additional metadata field
            `type`="attachment".
        * `Tables` (when with_tables=True) are not split - each table corresponds to one
            langchain Document object.
            For tables, Document object has additional metadata fields `type`="table"
            and `text_as_html` with table HTML representation.
    documentTF
   
auto_tabbyrus+eng:autoN)splitwith_tableswith_attachmentsrecursion_deep_attachmentspdf_with_text_layerlanguagepagesis_one_column_documentdocument_orientationneed_header_footer_analysisneed_binarizationneed_pdf_table_analysis	delimiterencoding	file_pathr   r   r   r   r   r   r   r   r   r   r   r   r    r!   returnc                .   d t                                                      D             | _        h d| _        || j        vrt	          d| d| j         d          || _        || _        || _        | j        dk    rdnd}|| j        d	<   || j        d
<   dS )a
  
        Initialize with file path and parsing parameters.

        Args:
            file_path: path to the file for processing
            split: type of document splitting into parts (each part is returned
                separately), default value "document"
                "document": document text is returned as a single langchain Document
                    object (don't split)
                "page": split document text into pages (works for PDF, DJVU, PPTX, PPT,
                    ODP)
                "node": split document text into tree nodes (title nodes, list item
                    nodes, raw text nodes)
                "line": split document text into lines
            with_tables: add tables to the result - each table is returned as a single
                langchain Document object

            Parameters used for document parsing via `dedoc`
                (https://dedoc.readthedocs.io/en/latest/parameters/parameters.html):

                with_attachments: enable attached files extraction
                recursion_deep_attachments: recursion level for attached files
                    extraction, works only when with_attachments==True
                pdf_with_text_layer: type of handler for parsing PDF documents,
                    available options
                    ["true", "false", "tabby", "auto", "auto_tabby" (default)]
                language: language of the document for PDF without a textual layer and
                    images, available options ["eng", "rus", "rus+eng" (default)],
                    the list of languages can be extended, please see
                    https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
                pages: page slice to define the reading range for parsing PDF documents
                is_one_column_document: detect number of columns for PDF without
                    a textual layer and images, available options
                    ["true", "false", "auto" (default)]
                document_orientation: fix document orientation (90, 180, 270 degrees)
                    for PDF without a textual layer and images, available options
                    ["auto" (default), "no_change"]
                need_header_footer_analysis: remove headers and footers from the output
                    result for parsing PDF and images
                need_binarization: clean pages background (binarize) for PDF without a
                    textual layer and images
                need_pdf_table_analysis: parse tables for PDF without a textual layer
                    and images
                delimiter: column separator for CSV, TSV files
                encoding: encoding of TXT, CSV, TSV
        c                 "    i | ]\  }}|d v	||S )>   selfr   r"   r    ).0keyvalues      f/var/www/html/ai-engine/env/lib/python3.11/site-packages/langchain_community/document_loaders/dedoc.py
<dictcomp>z,DedocBaseLoader.__init__.<locals>.<dictcomp>d   s4     #
 #
 #
UGGG GGG    >   linenodepager   Got $ for `split`, but should be one of ``r/   treelinearstructure_typeneed_content_analysisN)localsitemsparsing_parametersvalid_split_values
ValueErrorr   r   r"   )r&   r"   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r6   s                    r+   __init__zDedocBaseLoader.__init__#   s    B#
 #
$hhnn..#
 #
 #

 #G"F"F////u / /+/ / /   
&"#':#7#7X4B 01;K 7888r-   c              #     K   ddl }	 ddlm} n# t          $ r t          d          w xY w ||                                           }d|j        d         _        |                                5 }|                    | j	        i | j
        d|i	          }ddd           n# 1 swxY w Y   |                     |                                                                | j        
          E d{V  dS )Lazily load documents.r   N)DedocManagerzE`dedoc` package not found, please install it with `pip install dedoc`)manager_configTloggerattachments_dir)r"   
parametersdocument_treer   )tempfilededocr@   ImportError_make_configconfigdisabledTemporaryDirectoryparser"   r:   _split_documentto_api_schemadictr   )r&   rG   r@   dedoc_managertmpdirrF   s         r+   	lazy_loadzDedocBaseLoader.lazy_loadw   s     	******* 	 	 	W  	 %D4E4E4G4GHHH26X&/((** 	f)//.Qd5Q7H&QQ 0  M	 	 	 	 	 	 	 	 	 	 	 	 	 	 	
 '''5577<<>>dj ( 
 
 	
 	
 	
 	
 	
 	
 	
 	
 	
s    )0(B$$B(+B(c                     dS )zu
        Make configuration for DedocManager according to the file extension and
        parsing parameters.
        Nr'   r&   s    r+   rJ   zDedocBaseLoader._make_config   s	     	r-   	paragraphc                      d                      fd|d         D                       }|r|d          d| n|d         }|S )z1Get text (recursively) of the document tree node.
c                 :    g | ]}                     |          S r'   )	_json2txt)r(   subparagraphr&   s     r+   
<listcomp>z-DedocBaseLoader._json2txt.<locals>.<listcomp>   s5         |,,  r-   subparagraphstext)join)r&   rW   subparagraphs_textr_   s   `   r+   r[   zDedocBaseLoader._json2txt   sy    !YY   $-o$>  
 
 "#y 88$68886" 	
 r-   rF   document_metadatac              #      K   t          |d                   dk    r*|d         D ]}|                     ||          E d{V   dS t          |d         i ||d                   V  dS )z4Parse recursively document tree obtained by `dedoc`.r^   r   rF   rb   Nr_   metadatapage_contentre   )len_parse_subparagraphsr
   )r&   rF   rb   r\   s       r+   ri   z$DedocBaseLoader._parse_subparagraphs   s       }_-..22 -o >  44".BS 5           
 *62K-Kz1JK       r-   additional_metadatac              #   F  K   |d         }|ri ||}|dk    r7|                      |d         d                   }t          ||          V  nQ|dk    r|d         d         d         }|d	         d         d
         }d}|D ]i}	|	d         d
         |k    r||                      |	          z  }-t          |i |d
|i          V  |	d         d
         }|                      |	          }jt          |i |d
|i          V  n|dk    rJ|d         d         d         D ]4}	|	d         }
t          |                      |	          i ||
          V  5nK|dk    r*|                     |d         d         |          E d{V  nt          d| d| j         d          | j        rI|d         d         D ]:}|                     |          \  }}t          |i |d         d|d          V  ;|d         D ]'}|                     || j        ddi          E d{V  (dS )z=Split document into parts according to the `split` parameter.re   r   content	structure)rW   rf   r0   r^   r   page_id r.   r/   rd   Nr1   r2   r3   tablestable)typetext_as_htmlattachmentsrr   
attachment)rF   r   rj   )	r[   r
   ri   r<   r;   r   
_get_tablerO   r   )r&   rF   r   rj   rb   r_   nodesrn   	page_textr/   line_metadatarq   
table_text
table_htmlru   s                  r+   rO   zDedocBaseLoader._split_document   s      **5 	M L#4 L8K LJ>>M),D[,Q>RRD7HIIIIIIIf__!),[9/JEAhz*95GI 	5 	5
#I.'99!5!55II"%.!J$5!Jy'!J!J      #:.y9G $t 4 4II&B-By'BB      
 f__%i0=oN   $Z 0!%!5!5C 1C]C       f__00+I6{C"3 1           /u / /+/ / /  
  
	&y1(; 	 	)-)?)?&
J!+
+ '(2         (6 	 	J++(j%+\$: ,          	 	r-   rq   c           
         d}|d         D ]9}|D ]/}|d                     d |d         D                       z  }|dz  }0|dz  }:d}|d         D ]w}|d	z  }|D ]h}d                     d
 |d         D                       }t          j        |          }|dz  }|d         r|dz  }|d|d          d|d          d| dz  }i|dz  }x|dz  }||fS )z.Get text and HTML representation of the table.ro   cells c              3   &   K   | ]}|d          V  dS r_   Nr'   r(   r.   s     r+   	<genexpr>z-DedocBaseLoader._get_table.<locals>.<genexpr>  s&      &N&NtF|&N&N&N&N&N&Nr-   lines	rY   zK<table border="1" style="border-collapse: collapse; width: 100%;">
<tbody>
z<tr>
c              3   &   K   | ]}|d          V  dS r   r'   r   s     r+   r   z-DedocBaseLoader._get_table.<locals>.<genexpr>  s&      %M%Mtd6l%M%M%M%M%M%Mr-   z<td	invisiblez style="display: none" z
 colspan="colspanz" rowspan="rowspanz">z</td>
z</tr>
z</tbody>
</table>)r`   htmlescape)r&   rq   rz   rowcellr{   	cell_texts          r+   rv   zDedocBaseLoader._get_table   s_   
> 	 	C # #chh&N&NW&N&N&NNNN
d"

$JJ 	 > 	$ 	$C("J 	 	 II%M%MtG}%M%M%MMM	 K	22	e#
$ <";;J>i > >Y> >+4> > >

 )#JJ**
:%%r-   N)__name__
__module____qualname____doc__strboolr	   intr   r=   r   r
   rT   r   rQ   rJ   r[   ri   rO   r   rv   r'   r-   r+   r   r      sz        (   -2*,#/!&,$*8=.348#'"&#RL RL RLRL 	RL
 RL  T	*RL %(RL !RL RL RL !$RL "RL &+39%5RL !d+RL "'sDy!1RL  C=!RL" 3-#RL$ 
%RL RL RL RLh
8H- 
 
 
 
, d    ^4 C    !6:	(	   & /3	I II I &d^	I
 
(	I I I IV& &sCx & & & & & &r-   r   c                       e Zd ZdZdefdZdS )DedocFileLoaderaw  
    DedocFileLoader document loader integration to load files using `dedoc`.

    The file loader automatically detects the file type (with the correct extension).
    The list of supported file types is gives at
    https://dedoc.readthedocs.io/en/latest/index.html#id1.
    Please see the documentation of DedocBaseLoader to get more details.

    Setup:
        Install ``dedoc`` package.

        .. code-block:: bash

            pip install -U dedoc

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import DedocFileLoader

            loader = DedocFileLoader(
                file_path="example.pdf",
                # split=...,
                # with_tables=...,
                # pdf_with_text_layer=...,
                # pages=...,
                # ...
            )

    Load:
        .. code-block:: python

            docs = loader.load()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }
    r#   c                 H    ddl m}  || j        | j        | j                  S )Nr   )make_manager_config)r"   parsing_paramsr   )dedoc.utils.langchainr   r"   r:   r   )r&   r   s     r+   rJ   zDedocFileLoader._make_config`  s>    ======""n2*
 
 
 	
r-   N)r   r   r   r   rQ   rJ   r'   r-   r+   r   r     s<        @ @D
d 
 
 
 
 
 
r-   r   c            #       @    e Zd ZdZdddddddd	d
d
dddddddededededeeef         dededededededeeef         deeef         deeef         dee         dee         ddf" fdZ	de
e         fdZdefd Zdeded!edeeeeeef         f         fd"Z xZS )#DedocAPIFileLoaderaU  
    Load files using `dedoc` API.
    The file loader automatically detects the file type (even with the wrong extension).
    By default, the loader makes a call to the locally hosted `dedoc` API.
    More information about `dedoc` API can be found in `dedoc` documentation:
        https://dedoc.readthedocs.io/en/latest/dedoc_api_usage/api.html

    Please see the documentation of DedocBaseLoader to get more details.

    Setup:
        You don't need to install `dedoc` library for using this loader.
        Instead, the `dedoc` API needs to be run.
        You may use Docker container for this purpose.
        Please see `dedoc` documentation for more details:
            https://dedoc.readthedocs.io/en/latest/getting_started/installation.html#install-and-run-dedoc-using-docker

        .. code-block:: bash

            docker pull dedocproject/dedoc
            docker run -p 1231:1231

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import DedocAPIFileLoader

            loader = DedocAPIFileLoader(
                file_path="example.pdf",
                # url=...,
                # split=...,
                # with_tables=...,
                # pdf_with_text_layer=...,
                # pages=...,
                # ...
            )

    Load:
        .. code-block:: python

            docs = loader.load()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }
    zhttp://0.0.0.0:1231r   TFr   r   r   r   r   N)urlr   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r#   c                    t                                          ||||||||	|
||||||           || _        d| j        d<   dS )a
  Initialize with file path, API url and parsing parameters.

        Args:
            file_path: path to the file for processing
            url: URL to call `dedoc` API
            split: type of document splitting into parts (each part is returned
                separately), default value "document"
                "document": document is returned as a single langchain Document object
                    (don't split)
                "page": split document into pages (works for PDF, DJVU, PPTX, PPT, ODP)
                "node": split document into tree nodes (title nodes, list item nodes,
                    raw text nodes)
                "line": split document into lines
            with_tables: add tables to the result - each table is returned as a single
                langchain Document object

            Parameters used for document parsing via `dedoc`
                (https://dedoc.readthedocs.io/en/latest/parameters/parameters.html):

                with_attachments: enable attached files extraction
                recursion_deep_attachments: recursion level for attached files
                    extraction, works only when with_attachments==True
                pdf_with_text_layer: type of handler for parsing PDF documents,
                    available options
                    ["true", "false", "tabby", "auto", "auto_tabby" (default)]
                language: language of the document for PDF without a textual layer and
                    images, available options ["eng", "rus", "rus+eng" (default)],
                    the list of languages can be extended, please see
                    https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
                pages: page slice to define the reading range for parsing PDF documents
                is_one_column_document: detect number of columns for PDF without
                    a textual layer and images, available options
                    ["true", "false", "auto" (default)]
                document_orientation: fix document orientation (90, 180, 270 degrees)
                    for PDF without a textual layer and images, available options
                    ["auto" (default), "no_change"]
                need_header_footer_analysis: remove headers and footers from the output
                    result for parsing PDF and images
                need_binarization: clean pages background (binarize) for PDF without a
                    textual layer and images
                need_pdf_table_analysis: parse tables for PDF without a textual layer
                    and images
                delimiter: column separator for CSV, TSV files
                encoding: encoding of TXT, CSV, TSV
        )r"   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   jsonreturn_formatN)superr=   r   r:   )r&   r"   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   	__class__s                    r+   r=   zDedocAPIFileLoader.__init__  sp    B 	#-'A 3#9!5(C/$; 	 	
 	
 	
" 39000r-   c              #      K   |                      | j        | j        | j                  }|                     || j                  E d{V  dS )r?   )r   r"   rD   rE   N)
_send_filer   r"   r:   rO   r   )r&   doc_trees     r+   rT   zDedocAPIFileLoader.lazy_load	  sa      ??DNt?V # 
 
 ''hdj'QQQQQQQQQQQr-   c                     i S r   r'   rV   s    r+   rJ   zDedocAPIFileLoader._make_config  s    	r-   rD   c                    ddl }t          j                            |          }t	          |d          5 }d||fi}|                    | d||          }ddd           n# 1 swxY w Y   |j        dk    r)t          d|j        	                                           t          j        |j        	                                          }	|	S )	z7Send POST-request to `dedoc` API and return the resultsr   Nrbfilez/upload)filesdata   zError during file handling: )requestsospathbasenameopenpoststatus_coder<   rl   decoder   loads)
r&   r   r"   rD   r   	file_namer   r   rresults
             r+   r   zDedocAPIFileLoader._send_file  s    	G$$Y//	)T"" 	Mdi./EoooULLA	M 	M 	M 	M 	M 	M 	M 	M 	M 	M 	M 	M 	M 	M 	M =CPAI<L<L<N<NPPQQQAI,,..//s   "A""A&)A&)r   r   r   r   r   r   r	   r   r   r=   r   r
   rT   rQ   rJ   r   listr   __classcell__)r   s   @r+   r   r   j  s       G GZ ) -2*,#/!&,$*8=.348#'"&%S: S: S:S: 	S:
 S: S:  T	*S: %(S: !S: S: S: !$S: "S: &+39%5S: !d+S:  "'sDy!1!S:" C=#S:$ 3-%S:& 
'S: S: S: S: S: S:jR8H- R R R Rd    #&48	c5tS))	*       r-   r   )r   r   r   abcr   r   typingr   r   r   r   r	   langchain_core.documentsr
   )langchain_community.document_loaders.baser   r   r   r   r'   r-   r+   <module>r      sJ     				 # # # # # # # #              . - - - - - @ @ @ @ @ @H& H& H& H& H&j# H& H& H&VJ
 J
 J
 J
 J
o J
 J
 J
Zx x x x x x x x x xr-   