
    Ng\<                       d dl mZ d dlZd dlZd dlmZmZmZ d dlm	Z	 d dl
mZ d dlmZmZmZmZmZmZ d dlZd dlZd dlmZmZ d dlmZmZmZmZmZmZmZm Z m!Z! d d	l"m#Z# d d
l$m%Z%m&Z& erd dl'm(Z( d dl)m*Z* 	 	 	 dSdTdZ+dUd!Z,	 	 	 	 	 	 	 	 	 	 dVdWd/Z-dXd1Z.d2 Z/	 	 	 dYdZd<Z0d[d>Z1 ed?          Z2d\dCZ3d]dFZ4d^dHZ5d_dLZ6	 	 	 d`dadRZ7dS )b    )annotationsN)BufferedReaderBytesIOTextIOWrapper)SpooledTemporaryFile)sleep)IOTYPE_CHECKINGAnyOptionalTypeVarcast)CoordinateSystem
PixelSpace)	TYPE_TO_TEXT_ELEMENT_MAPCheckBoxCoordinatesMetadataElementElementMetadataElementTypeListItem	PageBreakText)logger)ENUMERATED_BULLETS_REUNICODE_BULLETS_RE)
PageLayout)LayoutElementThtmllayout_element(LayoutElement | Element | dict[str, Any]coordinate_systemOptional[CoordinateSystem]infer_list_itemsboolsource_formatOptional[str]returnElement | list[Element]c                   t          | t                    r|dk    r| S t          | t                    rt          d          S t          | t                    s|                                 }n| }|                    dd          }|                    d          }|                    d          }|                    d          }|                    dd	          }	d	}
|	r|	j        }
|rKt          |t          t          t          t          j        f          rt          t          |          
          }nt                      }||||
d}|t          j        k    r|rt          |fi |S t!          dd|i|S |t"          v rit          |t                    sJ t"          |         } |dd|i|}|t          j        k    rd|j        _        n|t          j        k    rd|j        _        |S |t          j        t          j        t          j        t          j        t          j        t          j        fv r3|t          j        t          j        t          j        fv }t9          dd|i|S t;          dd|i|S )zSConverts an unstructured_inference LayoutElement object to an unstructured Element.r    textr-   coordinatestypeprobsourceN)detection_class_prob)r.   r"   metadatadetection_origin      checked )
isinstancer   r   dictto_dictgetvalueintstrfloatnumbersNumberr   r   LISTlayout_list_to_list_itemsr   r   HEADLINEr3   category_depthSUB_HEADLINECHECK_BOX_CHECKEDCHECK_BOX_UNCHECKEDRADIO_BUTTON_CHECKEDRADIO_BUTTON_UNCHECKEDCHECKED	UNCHECKEDr   r   )r    r"   r$   r&   layout_dictr-   r.   element_typer0   
aux_originoriginclass_prob_metadatacommon_kwargs_element_classr7   s                  `/var/www/html/ai-engine/env/lib/python3.11/site-packages/unstructured/partition/common/common.pynormalize_layout_elementrV   !   s    .'** }/F/F .),, "b!!!!nd++ %$,,..$??62&&D //-00K??6**L??6""D400JF "! 0
4#sE7>!BCC 0-5;;OOO-//".'"	 M {''' 		,   
     
 
1	1	1,,,,,,1,?' 
 


 
 ;///56N#22[55556N#2	%'(* 
 
 ),#
 

  
 


 
 	

  
 


 
 	
    r-   r.   )Optional[tuple[tuple[float, float], ...]]r3   Optional[ElementMetadata]r4   list[Element]c                Z   | rt          j        |           ng }t          |          dk    r| rt          j        |           ng }g }|D ]b}t          |                                          dk    r;t          |                                ||||          }|                    |           c|S )z=Converts a list LayoutElement to a list of ListItem elements.r5   r   )r-   r.   r"   r3   r4   )r   splitlenr   stripr   append)	r-   r.   r"   r3   r4   split_items
list_itemstext_segmentitems	            rU   rD   rD   y   s     8<C'-d333K
;18<D(.t444" "J# $ $|!!##$$q(( !''))'"3!!1  D d###rW   elementr   filenamefiletypepage_numberOptional[int]urltext_as_html
image_path	languagesOptional[list[str]]kwargsr   c                V   ||t          ||          nd}t          | d          rt          | j                  dk    r| j        nd}|rd |D             nd}|rd |D             nd}|rd |D             nd}t          | d          rt          | j                  dk    r| j        nd}|rd	 |D             nd}|rd
 |D             nd}| j        j        r| j        j        nd}t          ||||||||||||||
          }| j                            |           |	|	| j        _	        | S )zAdds document metadata to the document element.

    Document metadata includes information like the filename, source url, and page number.
    N)pointssystemlinksr   c                8    g | ]}|                     d           S )ri   r<   .0links     rU   
<listcomp>z(add_element_metadata.<locals>.<listcomp>   s"    333T%333rW   c                8    g | ]}|                     d           S r,   rt   ru   s     rU   rx   z(add_element_metadata.<locals>.<listcomp>   s$    555t$((6""555rW   c                8    g | ]}|                     d           S )start_indexrt   ru   s     rU   rx   z(add_element_metadata.<locals>.<listcomp>   s$    DDDd$((=11DDDrW   emphasized_textsc                8    g | ]}|                     d           S r,   rt   rv   emphasized_texts     rU   rx   z(add_element_metadata.<locals>.<listcomp>   s&    MMM		V	$	$MMMrW   c                8    g | ]}|                     d           S )tagrt   r~   s     rU   rx   z(add_element_metadata.<locals>.<listcomp>   s&    LLL		U	#	#LLLrW   )r.   re   rf   rg   ri   rj   	link_urls
link_textslink_start_indexesemphasized_text_contentsemphasized_text_tagsrF   rk   rl   )
r   hasattrr]   rr   r|   r3   rF   r   updater4   )rd   re   rf   rg   ri   rj   r.   r"   rk   r4   rl   rn   coordinates_metadatarr   r   r   r   r|   r   r   depthr3   s                         rU   add_element_metadatar      s   0 "'8'D	 	$	
 	
 	
 	

   %Wg66[3w};M;MPQ;Q;QGMMW[E7<F33U3333$I9>H55u5555DJHMWDDeDDDDSW 7.//	478P4Q4QTU4U4U 	    	MM<LMMMM  	LL;KLLLL 
 07/?/NXG++TXE(!-!91  H  H%%%#,<)NrW   layout_elementsc                    g }t                      }| D ]d}t          |          }t          |t                    r"|D ]	}||_        
|                    |           H||_        |                    |           e|S )zRemoves document metadata from the document element.

    Document metadata includes information like the filename, source url, and page number.
    )r   rV   r9   listr3   extendr_   )r   elementsr3   r    rd   _elements         rU   remove_element_metadatar      s    
 !H  H) % %*>::gt$$ 	%# - -$,!!OOG$$$$'GOOG$$$$OrW   c                     t          j                    D ]Z} 	 d|                                                                 v r dS /# t           j        t           j        t           j        f$ r Y Ww xY wdS )NsofficeTF)psutilprocess_iternamelowerNoSuchProcessAccessDeniedZombieProcess)procs    rU   _is_soffice_runningr      s    #%%  	DIIKK--////tt 0$f&96;OP 	 	 	D	5s   (A&A,+A,docx
   input_filenamer?   output_directorytarget_formattarget_filterwait_for_soffice_ready_time_outr>   c                   || d| }ddd|d|| g}	 d}d}t          j        |d	
          }|j                                                                        }	||k     rv|	dk    rp||z  }t                      rt          |           nAt          j        |d	
          }|j                                                                        }	||k     r|	dk    pn# t          $ r t          d          w xY wt          j	        |	           |j
        dk    s|	dk    rZt          j        d||j
                   t          j        |j                                                                                   dS dS )a  Converts a .doc/.ppt file to a .docx/.pptx file using the libreoffice CLI.

    Parameters
    ----------
    input_filename: str
        The name of the .doc file to convert to .docx
    output_directory: str
        The output directory for the convert .docx file
    target_format: str
        The desired output format
    target_filter: str
        The output filter name to use when converting. See references below
        for details.
    wait_for_soffice_ready_time_out: int
        The max wait time in seconds for soffice to become available to run

    References
    ----------
    https://stackoverflow.com/questions/52277264/convert-doc-to-docx-using-soffice-not-working
    https://git.libreoffice.org/core/+/refs/heads/master/filter/source/config/fragments/filters

    N:r   z
--headlessz--convert-toz--outdirr   g?T)capture_outputr+   a  soffice command was not found. Please install libreoffice
on your system and try again.

- Install instructions: https://www.libreoffice.org/get-help/install-howto/
- Mac: https://formulae.brew.sh/cask/libreoffice
- Debian: https://wiki.debian.org/LibreOfficez3soffice failed to convert to format %s with code %i)
subprocessrunstdoutdecoder^   r   r   FileNotFoundErrorr   info
returncodeerrorstderr)
r   r   r   r   r   command	wait_time
sleep_timeoutputmessages
             rU   convert_office_docr      s   :  (::=:: 	G
	
===-&&((..00 :::B#I"$$ 9j!!!!#EEE -..006688 :::B  
 
 
1
 
 	

 KABA=RXRc	
 	
 	
 	V]))++113344444	 "/s   CC C0Nonec                 V   t          d |                                 D                       dk    ryt          |                                           }t	          |          dk    r+dd                    |dd                    d|d          d}n|d	          d}t          |          dS )
z
    Verify arguments; exactly one of all keyword arguments must not be None.

    Example:
        >>> exactly_one(filename=filename, file=file, text=text, url=url)
    c                "    g | ]}|d uo|dk    S )Nr+   r8   )rv   args     rU   rx   zexactly_one.<locals>.<listcomp>O  s%    GGGS_*GGGrW   r5   zExactly one of z, Nz and z must be specified.r   )sumvaluesr   keysr]   join
ValueError)rn   namesr   s      rU   exactly_oner   H  s     GGv}}GGGHHAMMV[[]]##u::>>b		%*(=(=bbE"IbbbGGq666G!!! NMrW   _Tfile _T | SpooledTemporaryFile[bytes]_T | BytesIOc                    t          | t                    rI|                     d           t          t	          t
          |                                                     S | S )a  Convert `file` to `BytesIO` when it is a `SpooledTemporaryFile`.

    Note that `file` does not need to be IO[bytes]. It can be `None` or `bytes` and this function
    will not complain.

    In Python <3.11, `SpooledTemporaryFile` does not implement `.readable()` or `.seekable()` which
    triggers an exception when the file is loaded by certain packages. In particular, the stdlib
    `zipfile.Zipfile` raises on opening a `SpooledTemporaryFile` as does `Pandas.read_csv()`.
    r   )r9   r   seekr   r   bytesread)r   s    rU   spooled_to_bytes_io_if_neededr   [  sM     $,-- 1		!tE499;;//000 KrW   bytes | IO[bytes]r   c                   t          | t                    r| S t          | t                    r@|                     d           |                                 }|                     d           |S t          | t
                    r|                                 S t          | t          t          f          rAt          | j
        d          5 }|                                cddd           S # 1 swxY w Y   t          d          )zExtract the bytes from `file` without preventing it from being read again later.

    As a convenience to simplify client code, also returns `file` unchanged if it is already bytes.
    r   rbNzInvalid file-like object type)r9   r   r   r   r   r   getvaluer   r   openr   r   )r   f_bytesfs      rU   convert_to_bytesr   m  s   
 $ $,-- 		!))++		!$   }}$788 $)T"" 	a6688	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 4
5
55s   C((C,/C,sc                D    t          t          j        |                     S )z
    Check if the input string contains any emoji characters.

    Parameters:
    - s (str): The input string to check.

    Returns:
    - bool: True if the string contains any emoji, False otherwise.
    )r%   emojiemoji_count)r   s    rU   contains_emojir     s     !!$$%%%rW   pager   dict[str, Any]c                   t          | dd          }t          | dd          }|r|j        }|j        }|j        }nH|r@|                    d          }|                    d          }|                    d          }nd}d}d}|||dS )z:Retrieve image metadata and coordinate system from a page.imageNimage_metadataformatwidthheight)r   r   r   )getattrr   r   r   r<   )r   r   r   image_formatimage_widthimage_heights         rU   get_page_image_metadatar     s     D'4((ET#3T::N |k|	 %))(33$((11%))(33   rW   ocr_datalist['LayoutElement']
image_sizetuple[int | float, int | float]common_metadatac                    |\  }}t          ||          }g }| D ]J}	t          |	|||r|nd          }
|r|
j                            |           |                    |
           K|S )zNConvert OCR layout data into `unstructured` elements with associated metadata.)r   r   r   )r"   r$   r&   )r   rV   r3   r   r_   )r   r   r   r$   r&   r   r   r"   r   r    rd   s              rU   ocr_data_to_elementsr     s     !+K"\JJJ H" ! !*/-+8D--f	
 
 
  	5##O444    OrW   )NTr   )
r    r!   r"   r#   r$   r%   r&   r'   r(   r)   )r-   r'   r.   rX   r"   r#   r3   rY   r4   r'   r(   rZ   )
NNNNNNNNNN)rd   r   re   r'   rf   r'   rg   rh   ri   r'   rj   r'   r.   rX   r"   r#   rk   r'   r4   r'   rl   rm   rn   r   r(   r   )r   rZ   r(   rZ   )r   Nr   )
r   r?   r   r?   r   r?   r   r'   r   r>   )rn   r   r(   r   )r   r   r(   r   )r   r   r(   r   )r   r?   r(   r%   )r   r   r(   r   )NTN)r   r   r   r   r   rY   r$   r%   r&   r'   r(   rZ   )8
__future__r   rA   r   ior   r   r   tempfiler   timer   typingr	   r
   r   r   r   r   r   r   "unstructured.documents.coordinatesr   r   unstructured.documents.elementsr   r   r   r   r   r   r   r   r   unstructured.loggerr   unstructured.nlp.patternsr   r   'unstructured_inference.inference.layoutr   .unstructured_inference.inference.layoutelementr   rV   rD   r   r   r   r   r   r   r   r   r   r   r   r8   rW   rU   <module>r      s   " " " " " "      5 5 5 5 5 5 5 5 5 5 ) ) ) ) ) )       B B B B B B B B B B B B B B B B   K K K K K K K K
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 ' & & & & & O O O O O O O O MBBBBBBLLLLLL
 59!#)	U
 U
 U
 U
 U
p   @ #"!%"&=A48 $&*%)C C C C CL   &    #'+-K5 K5 K5 K5 K5\" " " "  WT]]   $6 6 6 60& & & &   : 26!#'      rW   