import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple

import sentencepiece as spm

from ...tokenization_utils import AddedToken, BatchEncoding, PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

SPIECE_UNDERLINE = "▁"

VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}

FAIRSEQ_LANGUAGE_CODES = [
    "ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN",
    "it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO",
    "ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN",
]


class MBartTokenizer(PreTrainedTokenizer):
    """
    Construct an MBART tokenizer.

    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
    [SentencePiece](https://github.com/google/sentencepiece).

    The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
    <tokens> <eos>` for target language documents.

    Examples:

    ```python
    >>> from transformers import MBartTokenizer

    >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO")
    >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
    >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
    >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
    ```"""

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    prefix_tokens: List[int] = []
    suffix_tokens: List[int] = []

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        tokenizer_file=None,
        src_lang=None,
        tgt_lang=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        additional_special_tokens=None,
        **kwargs,
    ):
        # Mask token behaves like a normal word, i.e. include the space before it
        mask_token = (
            AddedToken(mask_token, lstrip=True, normalized=False) if isinstance(mask_token, str) else mask_token
        )

        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(str(vocab_file))
        self.vocab_file = vocab_file

        # Mimic fairseq token-to-id alignment for the first 4 tokens
        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}

        # The first "real" token has position 4 in the original fairseq vocab and position 3 in the spm vocab
        self.fairseq_offset = 1

        self.sp_model_size = len(self.sp_model)
        self.lang_code_to_id = {
            code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(FAIRSEQ_LANGUAGE_CODES)
        }
        self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()}
        self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset

        self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
        _additional_special_tokens = list(self.lang_code_to_id.keys())

        if additional_special_tokens is not None:
            # Only add those special tokens if they are not already there.
            _additional_special_tokens.extend(
                [t for t in additional_special_tokens if t not in _additional_special_tokens]
            )

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            tokenizer_file=None,
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            additional_special_tokens=_additional_special_tokens,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )

        self._src_lang = src_lang if src_lang is not None else "en_XX"
        self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
        self.tgt_lang = tgt_lang
        self.set_src_lang_special_tokens(self._src_lang)

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        # for backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)

    @property
    def vocab_size(self):
        return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1  # Plus 1 for the mask token

    @property
    def src_lang(self) -> str:
        return self._src_lang

    @src_lang.setter
    def src_lang(self, new_src_lang: str) -> None:
        self._src_lang = new_src_lang
        self.set_src_lang_special_tokens(self._src_lang)

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        prefix_ones = [1] * len(self.prefix_tokens)
        suffix_ones = [1] * len(self.suffix_tokens)
        if token_ids_1 is None:
            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. An MBART sequence has the following format, where `X` represents the sequence:

        - `input_ids` (for encoder) `X [eos, src_lang_code]`
        - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`

        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
        separator.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
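
        Examples (an illustrative sketch; `tokenizer` is assumed to be loaded as in the class-level example above
        with `src_lang="en_XX"`, and only the position of the special tokens is checked, not concrete id values):

        ```python
        ids = tokenizer("UN Chief Says There Is No Military Solution in Syria")["input_ids"]
        # The encoder sequence ends with the EOS id followed by the source language code id.
        assert ids[-2:] == [tokenizer.eos_token_id, tokenizer.lang_code_to_id["en_XX"]]
        ```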
        """
        if token_ids_1 is None:
            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
        # We don't expect to process pairs, but leave the pair logic for API consistency
        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. mBART does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.

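        Examples (an illustrative sketch; `tokenizer` is assumed to be loaded as in the class-level example, and the
        ids passed in are arbitrary placeholders since only the length of the result matters):

        ```python
        token_type_ids = tokenizer.create_token_type_ids_from_sequences([5, 6, 7])
        assert token_type_ids == [0, 0, 0, 0, 0]  # cls + the three tokens + sep, all zeros
        ```
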
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

    def _build_translation_inputs(
        self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
    ):
        """Used by translation pipeline, to prepare inputs for the generate function"""
        if src_lang is None or tgt_lang is None:
            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
        self.src_lang = src_lang
        inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
        tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
        inputs["forced_bos_token_id"] = tgt_lang_id
        return inputs

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        spm_id = self.sp_model.PieceToId(token)

        # Need to return unknown token if the SP model returned 0
        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        return self.sp_model.IdToPiece(index - self.fairseq_offset)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def prepare_seq2seq_batch(
        self,
        src_texts: List[str],
        src_lang: str = "en_XX",
        tgt_texts: Optional[List[str]] = None,
        tgt_lang: str = "ro_RO",
        **kwargs,
    ) -> BatchEncoding:
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)

    def _switch_to_input_mode(self):
        return self.set_src_lang_special_tokens(self.src_lang)

    def _switch_to_target_mode(self):
        return self.set_tgt_lang_special_tokens(self.tgt_lang)

    def set_src_lang_special_tokens(self, src_lang) -> None:
        """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
        self.cur_lang_code = self.lang_code_to_id[src_lang]
        self.prefix_tokens = []
        self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]

    def set_tgt_lang_special_tokens(self, lang: str) -> None:
        """Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code]."""
        self.cur_lang_code = self.lang_code_to_id[lang]
        self.prefix_tokens = []
        self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]