
    Ngq                         d dl Z d dlmZmZmZmZ d dlZd dlmZ d dl	m
Z
 d dlmZ  e j        e          Z G d de          Z G d d	e
          ZdS )
    N)DictIteratorListUnion)Document)BaseBlobParser)Blobc                       e Zd ZdZdS )ServerUnavailableExceptionz7Exception raised when the Grobid server is unavailable.N)__name__
__module____qualname____doc__     o/var/www/html/ai-engine/env/lib/python3.11/site-packages/langchain_community/document_loaders/parsers/grobid.pyr   r      s        AADr   r   c            	       j    e Zd ZdZ	 ddededdfdZded	ededee         fd
Z	de
dee         fdZdS )GrobidParserz)Load  article `PDF` files using `Grobid`.1http://localhost:8070/api/processFulltextDocumentsegment_sentencesgrobid_serverreturnNc                     || _         || _        	 t          j        |           d S # t          j        j        $ r" t                              d           t          w xY w)NzyGROBID server does not appear up and running,                 please ensure Grobid is installed and the server is running)	r   r   requestsget
exceptionsRequestExceptionloggererrorr   )selfr   r   s      r   __init__zGrobidParser.__init__   st    
 "3*	-L'''''"3 	- 	- 	-LLM   -,	-s	   & 6A	file_pathxml_datac              #     K   	 ddl m} n# t          $ r t          d          w xY w ||d          }|                    d          }|                    d          }|r|d         j        ndg }|D ]4}	|	                    d          }
|
t          |	                    d
                    D ]\  }}g }g }t          |                    d                    D ]C\  }}|                    |j                   g }|                    d          |                    d          	                    d          D ]P}|	                    d          }|                    |d         |d         |d         |d         |d         d           Q|                    |           |du rzt          |          dk    rg|d         d         |d         d         }}|j        t          |          |g|
j        |
                    d          ||fd}|                    |           E|dur|d         d         d         |d         d         d         }}d                    |          t          |          ||
j        |
                    d          ||fd}|                    |           6fd|D             E d	{V  d	S )z!Process the XML file from Grobin.r   )BeautifulSoupzA`bs4` package not found, please install it with `pip install bs4`xmldivtitlezNo title foundheadNpscoords;,            )pagexyhwTr3   n)textparabboxessection_titlesection_numberpages c                    g | ]}t          |d          t          t          |d                    t          |d                   t          |d                   t          |d                   t          |d                   t          |d                   t                    t                    d                    S )r:   r;   r<   r?   r=   r>   )r:   r;   r<   r?   r=   r>   paper_titler"   )page_contentmetadata)r   dictstr).0chunkr"   r(   s     r   
<listcomp>z,GrobidParser.process_xml.<locals>.<listcomp>i   s     
 
 
   "6] #E&M 2 2 #E&M 2 2"%eHo"6"6!$U7^!4!4),U?-C)D)D*-e4D.E*F*F'*5zz%(^^	 	   
 
 
r   )bs4r%   ImportErrorfind_allr:   find	enumerateappendr   splitlenrF   join)r    r"   r#   r   r%   soupsectionstitleschunkssectionsecti	paragraphchunk_bboxesparagraph_textsentencesbboxesbboxboxfpagelpagesentence_dictparagraph_dictr(   s    `                     @r   process_xmlzGrobidParser.process_xml&   sU     
	))))))) 	 	 	V  	 }Xu--==''w'' 	%1INEE$E .	6 .	6G<<''D$-g.>.>s.C.C$D$D +6 +6LAy#%L%'N'01C1CC1H1H'I'I 9 98&--hm<<<"$#<<11=(0X(>(>(D(DS(I(I 
" 
"&*jjoo '03A-0V-0V-0V-0V%& %&!" !" !" !" )//888-55CLL1<L<L+21:f+=wr{6?R5E(0(+A+2)1526((3--*/- -M #MM-888(44(OA.v6(,R08  %
 %'GGN$;$;$'FF&2-1Y.2hhsmm&+U^* * n555
 
 
 
 
   !
 
 
 	
 	
 	
 	
 	
 	
 	
 	
 	
s    'blobc           	         |j         }|t          d          t          |d          }d||dddifi}	 i }dD ]}d||<   d	d
g|d<   |pi }t          j        d| j        d d ||d          }|j        }n6# t          j        j        $ r t          
                    d           d }Y nw xY w|t          g           S |                     ||| j                  S )Nzblob.source cannot be None.rbinputzapplication/pdfExpires0)generateIDsconsolidateHeadersegmentSentences1r)   r+   teiCoordinatesPOST<   )headersparamsfilesdatatimeoutz%GROBID server timed out. Return None.)source
ValueErroropenr   requestr   r:   r   ReadTimeoutr   r   iterre   r   )	r    rf   r"   pdfru   rv   paramrr#   s	            r   
lazy_parsezGrobidParser.lazy_parse|   s'   K	:;;;9d##9c+<y#>NOP	57DQ " "!U&,c]D!"KRE "  A vHH". 	 	 	LL@AAAHHH	 88O##Ix9OPPPs   >A3 30B&%B&)r   )r   r   r   r   boolrF   r!   r   r   re   r	   r   r   r   r   r   r      s        33
 Q- -- - 
	- - - - T
T
(+T
@DT
	(	T
 T
 T
 T
lQt Q(: Q Q Q Q Q Qr   r   )loggingtypingr   r   r   r   r   langchain_core.documentsr   )langchain_community.document_loaders.baser   1langchain_community.document_loaders.blob_loadersr	   	getLoggerr   r   	Exceptionr   r   r   r   r   <module>r      s     . . . . . . . . . . . .  - - - - - - D D D D D D B B B B B B		8	$	$	 	 	 	 	 	 	 	FQ FQ FQ FQ FQ> FQ FQ FQ FQ FQr   