
    NgP5                        d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
mZmZmZ d dlZd dlmZ d dlmZ d dlmZmZ d dlmZ dZd	Zd
ZdZdZdZdZdZdZ  ej!        e"          Z# eddd           G d dee                      Z$dS )    N)Path)AnyDictListMappingOptionalSequenceUnion)
deprecated)Document)	BaseModelmodel_validator)
BaseLoaderz#{http://www.w3.org/1999/xhtml}tablexpathidsourcename	structuretagprojectsz#https://api.docugami.com/v1preview1z0.0.24z1.0z!docugami_langchain.DocugamiLoader)sinceremovalalternative_importc                      e Zd ZU dZeZeed<   	 ej	        
                    d          Zee         ed<   	 dZeed<   	 dZeed<   	 d	Zeed
<   	 dZeed<   	 dZeed<   	 dZeed<   	 dZeed<   	 dZeed<   	 dZee         ed<   	 dZeee                  ed<   	 eeeeef                           ed<   	 dZeed<   	  ed          edeee f         de fd                        Z!	 	 d*de"dee         d ee#         de$e%         fd!Z&dede$e         fd"Z'dede$e         fd#Z(d$edefd%Z)	 	 d*d&ededee         d'ee#         de$e%         f
d(Z*de$e%         fd)Z+dS )+DocugamiLoaderzdLoad from `Docugami`.

    To use, you should have the ``dgml-utils`` python package installed.
    apiDOCUGAMI_API_KEYaccess_tokeni   max_text_length    min_text_lengthi   max_metadata_lengthFinclude_xml_tagsr   parent_hierarchy_levelsdoc_idparent_id_keysub_chunk_tablesTwhitespace_normalize_textN	docset_iddocument_ids
file_paths(include_project_metadata_in_doc_metadatabefore)modevaluesreturnc                 \   |                     d          r$|                     d          rt          d          |                     d          s$|                     d          st          d          |                     d          r$|                     d          st          d          |S )zValidate that either local file paths are given, or remote API docset ID.

        Args:
            values: The values to validate.

        Returns:
            The validated values.
        r+   r)   z7Cannot specify both file_paths and remote API docset_idz6Must specify either file_paths or remote API docset_idr   z7Must specify access token if using remote API docset_id)get
ValueError)clsr/   s     i/var/www/html/ai-engine/env/lib/python3.11/site-packages/langchain_community/document_loaders/docugami.pyvalidate_local_or_remotez'DocugamiLoader.validate_local_or_remoteT   s     ::l## 	X

;(?(? 	XVWWWzz,'' 	W

;0G0G 	WUVVV::k"" 	X6::n+E+E 	XVWWW    contentdocument_nameadditional_doc_metadatac           	          	 ddl m} n# t          $ r t          d          w xY w	 ddlm} ddlm} n# t          $ r t          d          w xY wd|dt          f fd	}|                    t          j
        |                    }|                                }	 ||	 j         j         j         j         j         j        
          }
i }|
D ]} ||          }|j                            t(                    }|rX|||<   |j        rL ||j                  }|j                            t(                    }|r|j        r||j         j        <   |||<   t1          |                                          S )z6Parse a single DGML document into a list of Documents.r   etreePCould not import lxml python package. Please install it with `pip install lxml`.)Chunk)
get_chunkszaCould not import from dgml-utils python package. Please install it with `pip install dgml-utils`.dg_chunkr0   c                 z   t          j        | j                                                                                  }t
          | j        t          |t          t          t          | j        t          | j        i}| j        }rj        r|                               t!          |d j                 |          S )N)page_contentmetadata)hashlibmd5textencode	hexdigest	XPATH_KEYr   ID_KEYDOCUMENT_NAME_KEYDOCUMENT_SOURCE_KEYSTRUCTURE_KEYr   TAG_KEYr   r,   updater   r   )rA   
_hashed_idrD   rG   r:   r9   selfs       r5   _build_framework_chunkz:DocugamiLoader._parse_dgml.<locals>._build_framework_chunk   s     X]%9%9%;%;<<FFHHJ8>
!=#]x1H =D& =@ =OO$;<<<!"8D$8"89!   r7   )r!   r   r(   r'   r#   r$   )lxmlr=   ImportErrordgml_utils.modelsr?   dgml_utils.segmentationr@   r   parseioBytesIOgetrootr!   r   r(   r'   r#   r$   rD   r2   rK   parentrC   r&   listr/   )rR   r8   r9   r:   r=   r?   r@   rS   treeroot	dg_chunksframework_chunksrA   framework_chunkchunk_idframework_parent_chunk	parent_ids   ` ``             r5   _parse_dgmlzDocugamiLoader._parse_dgmlj   s    	""""""" 	 	 	=  		//////::::::: 	 	 	C  		U 	x 	 	 	 	 	 	 	 	. {{2:g..//||~~J 0 0&*&D!2!2$($@
 
 
	 13! 
	M 
	MH44X>>O&/33F;;H M-< *? M-C-CHO-T-T* 6 ? C CF K KI  M%;%H MGP01CD6L(3$++--...s    &7 Ac                 <   | j          d| d}g }|rt          j        |dd| j         i          }|j        rF|                                }|                    |d                    |                    dd          }nt          d	| d
|j         d          ||S )z1Gets all document details for the given docset ID	/docsets/z
/documentsAuthorizationBearer )headers	documentsnextNFailed to download 
 (status: ))	r   requestsr2   r   okjsonextend	Exceptionstatus_code)rR   r)   urlall_documentsresponsedatas         r5   _document_details_for_docset_idz.DocugamiLoader._document_details_for_docset_id   s    99I999 	|(*GD4E*G*GH  H { }}$$T+%6777hhvt,,P#PP9MPPP    	 r7   c                 >   | j          d| }g }|rt          j        d|dd| j         ii           }|j        rF|                                }|                    |d                    |                    dd          }nt          d	| d
|j	         d          ||S )z0Gets all project details for the given docset IDz/projects?docset.id=GETri   rj   rk   rz   r   rm   Nrn   ro   rp   )
r   rq   requestr   rr   rs   rt   r2   ru   rv   )rR   r)   rw   all_projectsry   rz   s         r5   _project_details_for_docset_idz-DocugamiLoader._project_details_for_docset_id   s    ::y:: 	'(*GD4E*G*GH	  H { }}##D$4555hhvt,,P#PP9MPPP    	  r7   projectc                 
   |                     t                    }| j         d| d}g }i }|rt          j        d|dd| j         ii           }|j        rF|                                }|                    |d                    |                     dd	          }n(|j	        d
k    r|S t          d| d|j	         d          ||D ]}|                     d          }	|                     d          }
|                     d          }|	dk    ro|
rl|ri|t                   }i }t          j        d|
 ddd| j         ii           }|j        r	 ddlm} n# t          $ r t          d          w xY w|                    t          j        |j                            }|                                }|j        }|                    d|          }|D ]}|                    d|          d         j        }d                    |                    d|          d                                                                                   }|d	| j                 ||<   |||<   t          d|
 ddz             |S )z#Gets project metadata for all filesz
/projects/z/artifacts/latestr}   ri   rj   r~   	artifactsrm   Ni  rn   ro   rp   r   rw   documentzreport-values.xmlz/contentr   r<   r>   z
//pr:Entry)
namespacesz./pr:Heading z
./pr:Valuez	/content z (status: {response.status_code}))r2   rK   r   rq   r   r   rr   rs   rt   rv   ru   rT   r=   rU   rX   rY   rZ   r8   r[   nsmapr   rG   joinitertextstripr"   )rR   r   
project_idrw   all_artifactsper_file_metadatary   rz   artifactartifact_nameartifact_urlartifact_docr%   rD   r=   artifact_treeartifact_rootnsentriesentryheadingvalues                         r5   _metadata_for_projectz$DocugamiLoader._metadata_for_project   s!   [[((
BBZBBB"$ 	'(*GD4E*G*GH	  H { 
}}$$T+%6777hhvt,,%,,((P#PP9MPPP    	& & (	 (	H$LL00M#<<..L#<<
33L 33333%f-!# $+#---,.K8I.K.KL	   ; .......&   )I  
 %*KK
8;K0L0L$M$MM$1$9$9$;$;M&,B+11,21NNG!( N N"'++n+"L"LQ"O"T #!KKKDDQGPPRR! !%''  -22LD4L2L,M))08%f--#ElEEE<=  
 ! s   EE6document_idadditional_metadatac                     | j          d| d| d}t          j        d|dd| j         ii           }|j        r|                     |j        ||          S t          d	| d
|j         d          )zLoad chunks for a document.rh   z/documents/z/dgmlr}   ri   rj   r~   )r8   r9   r:   rn   ro   rp   )	r   rq   r   r   rr   rf   r8   ru   rv   )rR   r   r)   r9   r   rw   ry   s          r5   _load_chunks_for_documentz(DocugamiLoader._load_chunks_for_document(  s     LLILL+LLL#$&C0A&C&CD	
 
 
 ; 		## (+(; $    LcLLX5ILLL  r7   c                 ,    g } j         r j        r                      j                  } j        r fd|D             }                      j                  }i }|rW j        rP|D ]M}                     |          }|D ]3}||vr||         ||<   ||                             ||                    4N|D ]_}|t                   }	|	                    t                    }
|	                    |	          }|                     |	 j        |
|          z  }`ny j        rr j        D ]j}t          |          }t          |d          5 }|                     |                                |j                  z  }ddd           n# 1 swxY w Y   k|S )zLoad documents.c                 <    g | ]}|t                    j        v |S  )rK   r*   ).0drR   s     r5   
<listcomp>z'DocugamiLoader.load.<locals>.<listcomp>L  s1     % % %AfIAR4R4RA4R4R4Rr7   )r   r)   r9   r   rb)r8   r9   N)r   r)   r{   r*   r   r,   r   rP   rK   r2   rL   r   r+   r   openrf   readr   )rR   chunks_document_details_project_detailscombined_project_metadatar   rD   file_iddocr%   doc_namedoc_metadatapathfiles   `             r5   loadzDocugamiLoader.loadD  sM   !# '	 '	 $ D DT^ T T  % % % %0% % %!  $BB4>RR9;% 	YD$Q 	Y  0 Y YG#99'BBH#+ Y Y"*CCCAI'AR5g>>5g>EEhwFWXXXX	Y ) 	 	V77#4558<<VDD$88 &"n"*(4	 9   		 _ 	  Dzz$%% d.. $		&*i /   F               s   
2FF	F	)NN),__name__
__module____qualname____doc__DEFAULT_API_ENDPOINTr   str__annotations__osenvironr2   r   r   r   intr!   r"   r#   boolr$   r&   r'   r(   r)   r*   r	   r
   r   r,   r   classmethodr   r   r6   bytesr   r   r   rf   r{   r   r   r   r   r   r7   r5   r   r      s         
 $C###+"$*..1C"D"DL(3-DDD/OS,OSS""""/"d"""8#$S$$$K!M3!!!)"d"""9&*t***'  $Ix}###,,0L(8C=)000/%c	"234444&59,d999V_(###d38n     [ $#. (,59	I/ I/I/  }I/ "*'!2	I/
 
hI/ I/ I/ I/V d    * T
    .E!T E!d E! E! E! E!V (,15    }	
 &g. 
h   8-d8n - - - - - -r7   r   )%rE   rY   loggingr   pathlibr   typingr   r   r   r   r   r	   r
   rq   langchain_core._api.deprecationr   langchain_core.documentsr   pydanticr   r   )langchain_community.document_loaders.baser   
TABLE_NAMErJ   rK   rM   rL   rN   rO   PROJECTS_KEYr   	getLoggerr   loggerr   r   r7   r5   <module>r      ss    				  				       F F F F F F F F F F F F F F F F F F  6 6 6 6 6 6 - - - - - - / / / / / / / / @ @ @ @ @ @2
		  
< 		8	$	$ 
:  
N N N N NZ N N 
N N Nr7   