
    Ng/A                    $   d dl mZ d dlZd dlmZmZmZ d dlZd dlm	Z	m
Z
mZ d dlmZ d dlmZ d dlmZmZ g dZi d	d
dddddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*i d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHdIdJdKdLi dMdNdOdPdQdRdSdTdUdVdWdXdYdZd[d\d]d^d_d`dadbdcdddedfdgdhdidjdkdldmdndodpdqdrZd!gfddwZddyZdd~ZddZddZddZddZdgfddZ	 dddZddZdS )    )annotationsN)IterableIteratorOptional)DetectorFactorydetect_langslang_detect_exception)Element)logger)TESSERACT_LANGUAGES_AND_CODESTESSERACT_LANGUAGES_SPLITTER)~aframharaasmazeaze_cyrlbelbenbodbosbrebulcatcebceschi_simchi_sim_vertchi_trachi_tra_vertchrcoscymdandeudivdzoellengenmepoequesteusfaofasfilfinfrafrkfrmfryglagleglggrcgujhathebhinhrvhunhyeikuindislitaita_oldjavjpnjpn_vertkankatkat_oldkazkhmkirkmrkorkor_vertlaolatlavlitltzmalmarmkdmltmonmrimsamyanepnldnorocioriosdpanpolporpusqueronrussansinslkslvsndsnumspaspa_oldsqisrpsrp_latnsunswaswesyrtamtatteltgkthatirtonturuigukrurduzbuzb_cyrlvieyidyorr   afr   arr   azr   ber   bsr   bgr   csr   chr   chinese_chtr#   cyr$   dar%   germanr)   enr-   etr0   far3   frr8   gar>   hir?   hrr@   hurC   idrD   isrE   itrH   japanrQ   koreanrP   kurT   rs_latinrU   lvrV   ltrY   mrr[   mtr^   msr`   nera   nlrb   norg   plrh   ptrk   rorl   ruro   skrp   slrs   esru   sqrv   rs_cyrillicry   swrz   svr|   tar~   ter   trr   ugr   ukuruzvi)r   r   r   	languagesOptional[list[str]]returnstrc                    | t          d          d d | D             D             }t          t                              |                    }t	          |          dk    rt          j        d|  d           dS t          j        |          S )	zf
    Entry point: convert languages (list of strings) into tesseract ocr langcode format (uses +)
    Nz`languages` can not be `None`c                    g | ]}||S  r   ).0	lang_codes     ^/var/www/html/ai-engine/env/lib/python3.11/site-packages/unstructured/partition/common/lang.py
<listcomp>z3prepare_languages_for_tesseract.<locals>.<listcomp>   s0              c              3  4   K   | ]}t          |          V  d S )N)/_convert_language_code_to_pytesseract_lang_coder   langs     r   	<genexpr>z2prepare_languages_for_tesseract.<locals>.<genexpr>   s<       
 
FJ;DAA
 
 
 
 
 
r   r   z@Failed to find any valid standard language code from languages: z, proceed with `eng` instead.r)   )	
ValueErrorlistdictfromkeyslenr   warningr   join)r   converted_languagess     r   prepare_languages_for_tesseractr      s     8999 
 
NW
 
 
   t}}-@AABB
1$$C#C C C	
 	
 	
 u',-@AAAr   tesseract_languagec                    t                               |                                           }|st          j        |  d           dS |S )z
    Convert TesseractOCR language code to PaddleOCR language code.

    :param tesseract_language: str, language code used in TesseractOCR
    :return: str, corresponding language code for PaddleOCR or None if not found
    zM is not a language code supported by PaddleOCR, proceeding with `en` instead.r   )#PYTESSERACT_TO_PADDLE_LANG_CODE_MAPgetlowerr   r   )r   r   s     r   tesseract_to_paddle_languager      s^     /223E3K3K3M3MNND ! - - -	
 	
 	
 tKr   	list[str]ocr_languagesstr | list[str] | Nonelist[str] | Nonec                d   |r#t          |          }t          j        d           |t          |t                    sJ |rd|v rt          d          t          | t                    st          d          |r>| dgk    s	| dgk    s| s.|                    t                    } t          j        d           | rvd| vrmt          |           D ]/\  }}t          j        |                                |          | |<   0t          |           }|sdS |                    t                    } ndg} | S dS )a  Handle users defining both `ocr_languages` and `languages`.

    Give preference to `languages` and convert `ocr_languages` if needed, but default to `None`.

    `ocr_languages` is only a parameter for `auto.partition`, `partition_image`, & `partition_pdf`.
    `ocr_languages` should not be defined as 'auto' since 'auto' is intended for language detection
    which is not supported by `partition_image` or `partition_pdf`.
    zmThe ocr_languages kwarg will be deprecated in a future version of unstructured. Please use languages instead.Nautoa  `ocr_languages` is deprecated but was used to extract text from pdfs and images. The 'auto' argument is only for language *detection* when it is assigned to `languages` and partitioning documents other than pdfs or images. Language detection is not currently supported in pdfs or images.zOThe language parameter must be a list of language codes as strings, ex. ['eng'] z}Only one of languages and ocr_languages should be specified. languages is preferred. ocr_languages is marked for deprecation.)_clean_ocr_languages_argr   r   
isinstancer   r   r   	TypeErrorsplitr   	enumerater   r   r   )r   r   ir   str_languagess        r   check_language_argsr      s     
0??,	
 	
 	
  J}c$B$B    
=00P
 
 	
 i&& 
]
 
 	

  
)x//93D3DI3D!''(DEE	O	
 	
 	
  ""$Y// U U4<@tTT	!4Y??M  t%++,HIIII
  I4r   c                6    |                      t                    S )z
    Convert ocr_languages parameter to list of langcode strings.
    Assumption: ocr_languages is in tesseract plus sign format
    )r   r   r   s    r   &convert_old_ocr_languages_to_languagesr   <  s     ;<<<r   r   c                   | t           v r| S t          |           }d t           D             }|r|j        |v r(t          |j                  }t	          j        |          S |j        |v r(t          |j                  }t	          j        |          S |j        |v r(t          |j                  }t	          j        |          S t          j	        |  d           dS t          j	        |  d           dS )zq
    Convert a single language code to its tesseract formatted and recognized
    langcode(s), if supported.
    c                "    h | ]}|d d         S )N   r   r   s     r   	<setcomp>zB_convert_language_code_to_pytesseract_lang_code.<locals>.<setcomp>T  s     GGG48GGGr   z* is not a language supported by Tesseract.r   )
PYTESSERACT_LANG_CODES_get_iso639_language_objectpart3(_get_all_tesseract_langcodes_with_prefixr   r   part2bpart2tr   r   )r   lang_iso639pytesseract_langs_3matched_langcodess       r   r   r   E  s    %%%-d33K HG0FGGG  333 HIZ [ [/45FGGG #666 HI[ \ \/45FGGG #666 HI[ \ \/45FGGG NdNNNOOO2
NdFFFGGG2r   Optional[iso639.Language]c                    	 t           j                            |                                           S # t           j        $ r t          j        |  d           Y d S w xY w)Nz' is not a valid standard language code.)iso639Languagematchr   LanguageNotFoundErrorr   r   )r   s    r   r   r   m  sc    $$TZZ\\222'   $GGGHHHtts   03 &AAprefixc                *      fdt           D             S )zb
    Get all matching tesseract langcodes with this prefix (may be one or multiple variants).
    c                >    g | ]}|                               |S r   )
startswith)r   langcoder  s     r   r   z<_get_all_tesseract_langcodes_with_prefix.<locals>.<listcomp>y  s-    [[[x?R?RSY?Z?Z[H[[[r   )r   )r  s   `r   r   r   u  s!     \[[[%;[[[[r   r   textc                   t          |t                    st          d          |d         dk    s|                                 dk    rdS t	          j        d|           r(t          |                                           dk     rdgS dt          _	        g }|red|vra|D ]\}t          j        |                                |          }t          |dd	                   }|r|                    |j                   ]nt          |          d
k    rt!          j        d| d           	 t%          |           }n1# t&          j        $ r}t!          j        |           Y d}~dS d}~ww xY wg }|D ]w}	t+          |	j                                      d          r|                    d           ?t          |	j        dd	                   }|r|                    |j                   x|D ]}||vr|                    |           |S )z
    Detects the list of languages present in the text (in the default "auto" mode),
    or formats and passes through the user inputted document languages if provided.
    zOThe language parameter must be a list of language codes as strings, ex. ["eng"]r   r   Nz^[\x00-\x7F]+$   r)   r   r      z9Since "auto" is present in the input languages provided (z]), the language will be auto detected and the rest of the inputted languages will be ignored.zhzho)r   r   r   striprer  r   r   r   seedr   r   r   r   appendr   r   r   r   r	   LangDetectExceptionr   r   r
  )
r  r   doc_languagesr   str_langlanguagelangdetect_resultelangdetect_langslangobjs
             r   detect_languagesr  |  s`    i&& 
]
 
 	
 |rTZZ\\R//t 
x!4(( S->->-B-Bw O!M
  (+V9,, 	5 	5D48tLLH28BQB<@@H 5$$X^444		5 y>>AN-I - - -  	 ,T 2 2$8 	 	 	N144444	 ') ) 	< 	<G7<  ++D11 < ''....6w|BQB7GHH <$++HN;;; % 	+ 	+D=(($$T***s   )D9 9E'E""E'FelementsIterable[Element]detect_language_per_elementboolIterator[Element]c              #    K   |dg}|dgk    r
| E d{V  dS t          | t                    st          |           } d                    d | D                       }t          ||          }|.t	          |          dk    r|du r| D ]}||j        _        |V  dS | D ]9}t          |d	          r#t          |j                  |j        _        |V  5|V  :dS )
zDetect language and apply it to metadata.languages for each element in `elements`.
    If languages is None, default to auto detection.
    If languages is and empty string, skip.Nr   r    c              3  D   K   | ]}t          |d           |j        V  dS )r  N)hasattrr  )r   r  s     r   r   z&apply_lang_metadata.<locals>.<genexpr>  s3      HHAWQ5G5GHHHHHHHr   )r  r   r  Fr  )	r   r   r   r  r   metadatar   r'  r  )r  r   r!  	full_textdetected_languagesr  s         r   apply_lang_metadatar+    s<      H	 RD h%% ">>HHHHHHHI)yINNN&	NNa'500  	 	A#5AJ GGGG	 	  	 	Aq&!! '7'?'?
$	 	r   list[str] | strc                    t          | t                    rd                    |           } t          j        dd|           } t          j        dd|           } | S )zFix common incorrect definitions for ocr_languages:
    defining it as a list, adding extra quotation marks, adding brackets.
    Returns a single string of ocr_languages+z[\"']r   z[\[\]])r   r   r   r  subr   s    r   r   r     sU    
 -&& 0// F8R77MF9b-88Mr   )r   r   r   r   )r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   )r   r   r   r   )r   r   r   r  )r  r   r   r   )r  r   r   r   r   r   )F)r  r   r   r   r!  r"  r   r#  )r   r,  r   r   )
__future__r   r  typingr   r   r   r  
langdetectr   r   r	   unstructured.documents.elementsr
   unstructured.loggerr   &unstructured.partition.utils.constantsr   r   r   r   r   r   r   r   r   r   r   r  r+  r   r   r   r   <module>r6     s   " " " " " " 				 / / / / / / / / / /           4 3 3 3 3 3 & & & & & &          B7'	47'	47' 
47' 
4	7'
 
47' 
47' 
47' t7' }7' 
47' 
47' 
87' 
47' 
47' 
47'  
4!7'" 
4#7' 7'$ 
4%7'& 
4'7'( 
4)7'* 
4+7', 
4-7'. 
4/7'0 
717'2 
837'4 
457'6 
:77'8 
497': 
4;7'< 
4=7'> 
4?7'@ 
4A7'B 
4C7'D 
4E7' 7' 7'F 
4G7'H 
4I7'J 
4K7'L 
4M7'N 
4O7'P 
4Q7'R 
4S7'T 
4U7'V 
4W7'X 
=Y7'Z 
4[7'\ 
4]7'^ 
4_7'` 
4a7'b 
4c7'd 
4e7'f 
4g7' 7'h m7' 7' 7' #t GLW B B B B B2   &< < < <~= = = =% % % %P   \ \ \ \ '-XL L L L Ld ).- - - - -`     r   