
    Ng              	           d dl Z d dlmZmZ d dlZd dlmZmZ d dlm	Z	 d dl
mZ dedefdZd	efd
Zddedee         dedefdZd ZdS )    N)ListOptional)MarianMTModelMarianTokenizer)sent_tokenize)chunk_by_attention_windowsource_langtarget_langc                     d|  d| S )zjConstructs the name of the MarianMT machine translation model based on the
    source and target language.zHelsinki-NLP/opus-mt-- )r	   r
   s     [/var/www/html/ai-engine/env/lib/python3.11/site-packages/unstructured/cleaners/translate.py_get_opus_mt_model_namer      s     ?;>>>>>    language_codec                 |    t          | t                    rt          |           dk    rt          d|  d          d S )N   zInvalid language code: z,. Language codes must be two letter strings.)
isinstancestrlen
ValueError)r   s    r   _validate_language_coder      sO    mS)) 
S-?-?1-D-Damaaa
 
 	
 .E-Dr   entextreturnc                 D   |                                  dk    r| S ||nt          j        |           }|                    d          rd}t	          |           t	          |           ||k    r| S t          ||          }	 t          j        |          }t          j        |          }n!# t          $ r t          d| d          w xY wt          | |t                    }g }|D ]&}	|                    t          | ||                     'd                    |          S )a  Translates the foreign language text. If the source language is not specified, the
    function will attempt to detect it using langdetect.

    Parameters
    ----------
    text: str
        The text to translate
    target_lang: str
        The two letter language code for the target langague. Defaults to "en".
    source_lang: Optional[str]
        The two letter language code for the language of the input text. If source_lang is
        not provided, the function will try to detect it.
     Nzhz2Transformers could not find the translation model z>. The requested source/target language combo is not supported.)split_function )strip
langdetectdetect
startswithr   r   r   from_pretrainedr   OSErrorr   r   r   append_translate_textjoin)
r   r	   r
   _source_lang
model_name	tokenizermodelchunkstranslated_chunkschunks
             r   translate_textr1      s`    zz||r'2'>JDUVZD[D[L t$$ K(((L)))l""({CCJ
#3J??	-j99 
 
 
K K K K
 
 	

 2$	R_```F#% J J  ui!H!HIIII88%&&&s   (B* *Cc                     t          j                    5  t          j        d            |j        d	i  | gddd          }ddd           n# 1 swxY w Y   fd|D             d         S )
z8Translates text using the specified model and tokenizer.ignorept
max_length   )return_tensorspaddingr5   Nc                 @    g | ]}                     |d d          S )r6   T)max_new_tokensskip_special_tokens)decode).0tr,   s     r   
<listcomp>z#_translate_text.<locals>.<listcomp>T   s/    bbbRSIQsMMbbbr   r   r   )warningscatch_warningssimplefiltergenerate)r   r-   r,   
translateds     ` r   r(   r(   J   s    
 
	 	"	" 
 
h'''#U^ 
 
it\VYZZZ
 


 
 
 
 
 
 
 
 
 
 
 
 
 
 

 cbbbWabbb	 s   0AAA)Nr   )r@   typingr   r   r"   transformersr   r   unstructured.nlp.tokenizer    unstructured.staging.huggingfacer   r   r   r   r1   r(   r   r   r   <module>rI      s     ! ! ! ! ! ! ! !     7 7 7 7 7 7 7 7 3 3 3 3 3 3 F F F F F F? ?3 ? ? ? ?
3 
 
 
 
/' /' /'8C= /'c /']` /' /' /' /'d    r   