
import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple

import sentencepiece as spm

from ...tokenization_utils import AddedToken, BatchEncoding, PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

SPIECE_UNDERLINE = "▁"

VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}

FAIRSEQ_LANGUAGE_CODES = {
    "base": ["__java__", "__python__", "__en_XX__"],
    "multi": ["__java__", "__python__", "__en_XX__", "__javascript__", "__php__", "__ruby__", "__go__"],
}

FAIRSEQ_LANGUAGE_CODES_MAP = {
    "java": "__java__",
    "python": "__python__",
    "en_XX": "__en_XX__",
    "javascript": "__javascript__",
    "php": "__php__",
    "ruby": "__ruby__",
    "go": "__go__",
}


class PLBartTokenizer(PreTrainedTokenizer):
    """
    Construct a PLBART tokenizer.

    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
    [SentencePiece](https://github.com/google/sentencepiece).

    The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
    <tokens> <eos>` for target language documents.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        src_lang (`str`, *optional*):
            A string representing the source language.
        tgt_lang (`str`, *optional*):
            A string representing the target language.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The start of sequence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The cls token, which is a special token used as the first token for all tasks.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the token used when training this model with masking tasks. This
            is only used in the `"base"` tokenizer type. For the `"multi"` tokenizer, masking is never done for the
            downstream tasks.
        language_codes (`str`, *optional*, defaults to `"base"`):
            What language codes to use. Should be one of `"base"` or `"multi"`.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:
            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.
            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
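
            For example, subword regularization can be enabled at load time (a sketch; the kwargs are
            forwarded verbatim to `SentencePieceProcessor.__init__()`):

            ```python
            tokenizer = PLBartTokenizer.from_pretrained(
                "uclanlp/plbart-base", sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1}
            )
            ```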

    Examples:

    ```python
    >>> from transformers import PLBartTokenizer

    >>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-python-en_XX", src_lang="python", tgt_lang="en_XX")
    >>> example_python_phrase = "def maximum(a,b,c):NEW_LINE_INDENTreturn max([a,b,c])"
    >>> expected_translation_english = "Returns the maximum value of a b c."
    >>> inputs = tokenizer(example_python_phrase, text_target=expected_translation_english, return_tensors="pt")
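    >>> # The encoded source ends with `</s>` followed by the language-code token, matching the
    >>> # `<tokens> <eos> <language code>` scheme above (a sketch; exact ids depend on the checkpoint):
    >>> tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
    >>> tokens[-2:]
    ['</s>', '__python__']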
    ```"""

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    prefix_tokens: List[int] = []
    suffix_tokens: List[int] = []

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        language_codes="base",
        tokenizer_file=None,
        src_lang=None,
        tgt_lang=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        additional_special_tokens=None,
        clean_up_tokenization_spaces=True,
        **kwargs,
    ):
        # Mask token behaves like a normal word, i.e. it includes the space before it
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        src_lang = self._convert_lang_code_special_format(src_lang)
        tgt_lang = self._convert_lang_code_special_format(tgt_lang)

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(str(vocab_file))
        self.vocab_file = vocab_file
        self.language_codes = language_codes
        fairseq_language_codes = FAIRSEQ_LANGUAGE_CODES[self.language_codes]

        # Original fairseq vocab and spm vocab must be "aligned":
        # mimic the fairseq token-to-id alignment for the first 4 tokens
        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}

        # The first "real" token has position 4 in the original fairseq vocab and position 3 in the spm vocab
        self.fairseq_offset = 1

        self.sp_model_size = len(self.sp_model)
        self.lang_code_to_id = {
            code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(fairseq_language_codes)
        }
        self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()}

        if self.language_codes == "base":
            self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset

        self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

        _additional_special_tokens = list(self.lang_code_to_id.keys())

        if additional_special_tokens is not None:
            # Only add those special tokens if they are not already there.
            _additional_special_tokens.extend(
                [t for t in additional_special_tokens if t not in _additional_special_tokens]
            )

        if self.language_codes == "base":
            self._src_lang = src_lang
            self.cur_lang_code_id = self.lang_code_to_id[self._src_lang] if self._src_lang is not None else None
        else:
            self._src_lang = src_lang if src_lang is not None else "__en_XX__"
            self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            language_codes=language_codes,
            tokenizer_file=tokenizer_file,
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            additional_special_tokens=_additional_special_tokens,
            sp_model_kwargs=self.sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

        self.tgt_lang = tgt_lang
        self.set_src_lang_special_tokens(self._src_lang)

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        # for backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)

    @property
    def vocab_size(self):
        if self.language_codes == "base":
            return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1  # plus 1 for the mask token
        else:
            return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset

    @property
    def src_lang(self) -> str:
        return self._src_lang

    @src_lang.setter
    def src_lang(self, new_src_lang: str) -> None:
        new_src_lang = self._convert_lang_code_special_format(new_src_lang)
        self._src_lang = new_src_lang
        self.set_src_lang_special_tokens(self._src_lang)

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        prefix_ones = [1] * len(self.prefix_tokens)
        suffix_ones = [1] * len(self.suffix_tokens)
        if token_ids_1 is None:
            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A PLBART sequence has the following format, where `X` represents the sequence:

        - `input_ids` (for encoder) `X [eos, src_lang_code]`
        - `decoder_input_ids` (for decoder) `X [eos, tgt_lang_code]`

        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
        separator.
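
        For instance, with the source suffix set (a sketch; exact ids depend on the checkpoint
        vocabulary):

        ```python
        tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-python-en_XX", src_lang="python")
        ids = tokenizer.build_inputs_with_special_tokens([10, 20, 30])
        # ids == [10, 20, 30, tokenizer.eos_token_id, tokenizer.lang_code_to_id["__python__"]]
        ```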

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
        # We don't expect to process pairs, but leave the pair logic for API consistency
        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. PLBart does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
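
        For instance (a sketch of the rule above):

        ```python
        tokenizer.create_token_type_ids_from_sequences([10, 20])
        # [0, 0, 0, 0] -> len(cls + token_ids_0 + sep) zeros
        ```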
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

    def _build_translation_inputs(
        self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
    ):
        """Used by the translation pipeline to prepare inputs for the generate function."""
        if src_lang is None or tgt_lang is None:
            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
        self.src_lang = self._convert_lang_code_special_format(src_lang)
        self.tgt_lang = self._convert_lang_code_special_format(tgt_lang)
        inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
        tgt_lang_id = self.convert_tokens_to_ids(self.tgt_lang)
        inputs["forced_bos_token_id"] = tgt_lang_id
        return inputs

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        spm_id = self.sp_model.PieceToId(token)

        # Need to return the unknown token if the SP model returned 0
        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        return self.sp_model.IdToPiece(index - self.fairseq_offset)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) into a single string."""
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def prepare_seq2seq_batch(
        self,
        src_texts: List[str],
        src_lang: str = "en_XX",
        tgt_texts: Optional[List[str]] = None,
        tgt_lang: str = "python",
        **kwargs,
    ) -> BatchEncoding:
        self.src_lang = self._convert_lang_code_special_format(src_lang)
        self.tgt_lang = self._convert_lang_code_special_format(tgt_lang)
        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)

    def _switch_to_input_mode(self):
        return self.set_src_lang_special_tokens(self.src_lang)

    def _switch_to_target_mode(self):
        return self.set_tgt_lang_special_tokens(self.tgt_lang)

    def set_src_lang_special_tokens(self, src_lang) -> None:
        """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
        src_lang = self._convert_lang_code_special_format(src_lang)
        self.cur_lang_code = self.lang_code_to_id[src_lang] if src_lang is not None else None
        self.prefix_tokens = []
        if self.cur_lang_code is not None:
            self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
        else:
            self.suffix_tokens = [self.eos_token_id]

    def set_tgt_lang_special_tokens(self, lang: str) -> None:
        """Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code]."""
        lang = self._convert_lang_code_special_format(lang)
        self.cur_lang_code = self.lang_code_to_id[lang] if lang is not None else None
        self.prefix_tokens = []
        if self.cur_lang_code is not None:
            self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
        else:
            self.suffix_tokens = [self.eos_token_id]

    def _convert_lang_code_special_format(self, lang: str) -> str:
        """Convert language codes to the format the tokenizer uses, if required."""
        lang = FAIRSEQ_LANGUAGE_CODES_MAP[lang] if lang in FAIRSEQ_LANGUAGE_CODES_MAP.keys() else lang
        return lang