
    Ng#                       d Z ddlmZ ddlZddlZddlZddlmZ ddlm	Z	m
Z
 ddlmZmZmZmZmZmZmZ  ej        e          j                                        Zej                            eddd          Zd	 Zd
 Zd Zd Zd Zd Z dMdZ!dMdZ"d Z#dMdZ$ej%        &                    dg d          dNd            Z'dMdZ(ej%        &                    dg d          dNd            Z)d Z*d Z+d  Z,d! Z-dMd"Z.d# Z/dMd$Z0ej%        &                    d%d&d'g          dOd)            Z1ej%        &                    d*d+gd+fd,d-d.d/gd0fd1gd0fd2g          dPd5            Z2d6 Z3ej%        &                    d7d8gd0d8gfd9gd:d8gfd8gd/d8gfd8gd1d8gfd8gd0gd8gfd8gd/gd8gfd;gd<d8d0gfg          dQdB            Z4ej%        &                    d7g d0d0gfdCgd1d0gfdCgd0d0gfdCgd/d0gfg          dRdD            Z5ej%        &                    dEg dfdCgdfg          dSdG            Z6dH Z7ej%        &                    dEg dIgfdCgdJfg          dTdL            Z8dS )Uz=Unit-test suite for the `unstructured.partition.lang` module.    )annotationsN)LogCaptureFixture)NarrativeText	PageBreak)_clean_ocr_languages_arg/_convert_language_code_to_pytesseract_lang_codeapply_lang_metadatacheck_language_argsdetect_languagesprepare_languages_for_tesseracttesseract_to_paddle_languagez..zexample-docsc                 6    dg} t          |           dk    sJ d S Nenengr   	languagess    h/var/www/html/ai-engine/env/lib/python3.11/site-packages/test_unstructured/partition/common/test_lang.py6test_prepare_languages_for_tesseract_with_one_languager      s(    I*955>>>>>>    c                 8    ddg} t          |           dk    sJ d S r   r   r   s    r   >test_prepare_languages_for_tesseract_with_duplicated_languagesr   $   s*    uI*955>>>>>>r   c                 f    dg} t          |           dk    sJ dg} t          |           dk    sJ d S )Nosdequr   r   s    r   1test_prepare_languages_for_tesseract_special_caser   )   sH    I*955>>>>I*955>>>>>>r   c                 8    ddg} t          |           dk    sJ d S )Nkbdeszspa+spa_oldr   r   s    r   9test_prepare_languages_for_tesseract_removes_empty_inputsr!   1   s*    I*955FFFFFFr   c                 6    dg} t          |           dk    sJ d S )Nchi)chi_sim+chi_sim_vert+chi_tra+chi_tra_vertr   r   s    r   6test_prepare_languages_for_tesseract_includes_variantsr%   6   s)    I*9559dddddddr   c                 8    g d} t          |           dk    sJ d S )N)jaafrr   r   zjpn+jpn_vert+afr+eng+equr   r   s    r   <test_prepare_languages_for_tesseract_with_multiple_languagesr)   ;   s-    ***I*9559SSSSSSSr   caplogr   c                N    ddg}t          |          dk    sJ d| j        v sJ d S )Nzzzr#   r$   z"not a valid standard language coder   textr*   r   s     r   ?test_prepare_languages_for_tesseract_warns_nonstandard_languager0   @   s>    I*9559ddddd/6;>>>>>>r   c                N    ddg}t          |          dk    sJ d| j        v sJ d S )Nr   r   z%not a language supported by Tesseractr-   r/   s     r   Atest_prepare_languages_for_tesseract_warns_non_tesseract_languager2   F   s=    I*955>>>>2fkAAAAAAr   c                     t          j        t          d          5  d } t          |            d d d            d S # 1 swxY w Y   d S )Nz`languages` can not be `None`)match)pytestraises
ValueErrorr   r   s    r   3test_prepare_languages_for_tesseract_None_languagesr8   L   s    	z)H	I	I	I 3 3	'	2223 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3s   ;??c                L    dg}t          |          dk    sJ d| j        v sJ d S )N r   z>Failed to find any valid standard language code from languagesr-   r/   s     r   7test_prepare_languages_for_tesseract_no_valid_languagesr;   R   s;    I*955>>>>Kv{ZZZZZZr   tesseract_langexpected_lang))r   r   )chi_simch)chi_trachinese_cht)deugerman)jpnjapan)korkoreanr=   strr>   c                0    |t          |           k    sJ d S Nr   r<   s     r   -test_tesseract_to_paddle_language_valid_codesrM   X   s$     8HHHHHHHHr   c                J    d}t          |          dk    sJ d| j        v sJ d S )Nunsupported_langr   z?unsupported_lang is not a language code supported by PaddleOCR,)r   r.   )r*   r=   s     r   /test_tesseract_to_paddle_language_invalid_codesrP   g   s;    'N'774????LPVP[[[[[[[r   ))ENGr   )Frafr)DEUrD   c                0    |t          |           k    sJ d S rK   rL   r<   s     r   2test_tesseract_to_paddle_language_case_sensitivityrV   m   s$     8HHHHHHHHr   c                 6    d} t          |           dgk    sJ d S )NThis is a short sentence.r   r   r.   s    r   "test_detect_languages_english_autor[   y   s(    &DD!!eW,,,,,,r   c                 >    d} dg}t          | |          dgk    sJ d S )NThis is another short sentence.r   r   rY   r.   r   s     r   &test_detect_languages_english_providedr_   ~   s1    ,DID),,777777r   c                 6    d} t          |           dgk    sJ d S )Nu   안녕하세요rG   rY   rZ   s    r   !test_detect_languages_korean_autora      s(    DD!!eW,,,,,,r   c                 8    d} t          |           g dk    sJ d S )NzMy lubimy mleko i chleb.)cespolslkrY   rZ   s    r   -test_detect_languages_gets_multiple_languagesrf      s-    %DD!!%:%:%:::::::r   c                V    d}g d}t          ||          dgk    sJ d| j        v sJ d S )Nr]   )r   autorusr   z.rest of the inputted languages will be ignored)r   r.   )r*   r.   r   s      r   4test_detect_languages_warns_for_auto_and_other_inputrj      sH    ,D%%%ID),,7777;v{JJJJJJr   c                     t          j        t                    5  d} t          | d          dgk     d d d            d S # 1 swxY w Y   d S )NrX   r   r   )r5   r6   	TypeErrorr   rZ   s    r   <test_detect_languages_raises_TypeError_for_invalid_languagesrm      s    	y	!	! ; ;*///E7::; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ; ;s   A  AAc                    t          d          t          d          g}t          t          |dgd                    }dd | j        D             vsJ d S )NzSample text.r:   rh   T)elementsr   detect_language_per_elementzNo features in text.c                    g | ]	}|j         
S  )message).0recs     r   
<listcomp>zItest_apply_lang_metadata_has_no_warning_for_PageBreak.<locals>.<listcomp>   s    )P)P)P##+)P)P)Pr   )r   r   listr	   records)r*   ro   s     r   5test_apply_lang_metadata_has_no_warning_for_PageBreakry      sp    n--y}}=Hh(,	
 	
 	
 H ")P)P)P)P)PPPPPPPr   lang_inr>   )r   r   )rS   frar{   c                0    |t          |           k    sJ d S rK   )r   rz   s     r   3test_convert_language_code_to_pytesseract_lang_coder~      s$     KGTTTTTTTTr   input_ocr_langsexpectedr   )"deu"rC   )[deu]rC   )z['deu']rC   r   rC   r   )deu+spar   r   r   c                0    t          |           |k    sJ d S rK   )r   r   s     r   test_clean_ocr_languages_argr      s#     $O44@@@@@@r   c                 <    t          ddg          } | dgk    sJ d S )Nz Sample text longer than 5 words.Spanishr^   sparY   r   s    r   3test_detect_languages_handles_spelled_out_languagesr      s2     &HU^T_```Ir   )r   ocr_languagesexpected_langsr   spanishenglishzspa+deuzeng+deur   	list[str]r   list[str] | strr   c                T    t          | |          }|D ]}||v sJ d|j        v sJ d S Nr   r   r   r
   r.   r   r   r   r*   returned_langslangs         r   Ytest_check_language_args_uses_languages_when_ocr_languages_and_languages_are_both_definedr      s`    $ )#  N  . .~%%%%&+-----. .r   r:   c                T    t          | |          }|D ]}||v sJ d|j        v sJ d S r   r   r   s         r   Ktest_check_language_args_uses_ocr_languages_when_languages_is_empty_or_Noner      sV      )9MZZZN . .~%%%%&+-----. .r   r   Nonec                0    t          | |          }|J d S Nr   r
   )r   r   r   s      r   %test_check_language_args_returns_Noner      s(     )9MZZZN!!!!!r   c                 :    t          g dd           dgk    sJ d S )N)r   r   rh   r   rh   r   rr   r   r   %test_check_language_args_returns_autor     s1    )?)?)?tTTTY_X```````r   rh   zeng+autostr | list[str]c                    t          j        t                    5  t          | |           d d d            d S # 1 swxY w Y   d S r   )r5   r6   r7   r
   r   s     r   Ftest_check_language_args_raises_error_when_ocr_languages_contains_autor     s     
z	"	" 
 
'	
 	
 	
 	

 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
s   9= =)r*   r   )r=   rI   r>   rI   )r{   rI   r>   rI   )r   rI   r   rI   )r   r   r   r   r   r   r*   r   )r   r   r   rI   r   r   r*   r   )r   r   r   r   )r   r   r   r   )9__doc__
__future__r   ospathlibr5   test_unstructured.unit_utilsr   unstructured.documents.elementsr   r   "unstructured.partition.common.langr   r   r	   r
   r   r   r   Path__file__parentresolve	DIRECTORYpathjoinEXAMPLE_DOCS_DIRECTORYr   r   r   r!   r%   r)   r0   r2   r8   r;   markparametrizerM   rP   rV   r[   r_   ra   rf   rj   rm   ry   r~   r   r   r   r   r   r   r   rr   r   r   <module>r      s~   D C " " " " " " 				   : : : : : :                         GL"")1133	it^LL ? ? ?
? ? ?
? ? ?G G G
e e e
T T T
? ? ? ?B B B B3 3 3[ [ [ [ '  
 
I I I
 
I\ \ \ \ '   I I I I- - -
8 8 8- - -
; ; ;
K K K K; ; ;	Q 	Q 	Q 	Q   U U U U #
%
E
E A A A A     
 4
%%!
i%)
'E7#
'E7#
5'E7#
7)eW%
i%0 . . . . 4 
UUG
w 
uug
w 	 		. 	. 	.	 		. "	T

t " " " "a a a "	fX
z 
 
 
 
 
 
r   