
    gN                         d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	m
Z
mZmZ ddlZddlmZ ddlmZ ddlmZ erdd	lmZ dd
lmZ  ej        e          ZddiZdZ G d de          ZdS )z Tokenization class for model T5.    N)copyfile)TYPE_CHECKINGAnyDictListOptionalTuple   )import_protobuf)PreTrainedTokenizer)
AddedToken)	TextInput)logging
vocab_filezspiece.modelu   ▁c            
       <    e Zd ZdZeZddgZ	 	 	 	 	 	 	 	 d)d
eee	e
f                  ddf fdZd*dZed             Zed             Zd Z	 d+dee         deee                  dedee         f fdZd Zd Zdee         dee         fdZ	 d,dee         deee                  dee         fdZ	 d,dee         deee                  dee         fdZd Zd Zdddee	         f fd Zed!             Zd" Zd# Z d$ Z!d% Z"d,d&e	d'ee	         de#e	         fd(Z$ xZ%S )-T5Tokenizera  
    Construct a T5 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (`int`, *optional*, defaults to 100):
            Add a number of extra ids to the vocabulary for use as sentinels. These tokens are accessible as
            "<extra_id_{%d}>" where "{%d}" is a number between 0 and `extra_ids - 1`. These tokens can be retrieved
            by calling the `get_sentinel_tokens` method, and their ids by calling the `get_sentinel_token_ids`
            method.
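
            For instance, a minimal sketch (the concrete ids depend on the checkpoint vocabulary):

            ```python
            >>> from transformers import T5Tokenizer

            >>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base")
            >>> "<extra_id_0>" in tokenizer.get_sentinel_tokens()
            True
            >>> tokenizer.convert_tokens_to_ids("<extra_id_0>") in tokenizer.get_sentinel_token_ids()
            True
            ```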
        additional_special_tokens (`List[str]`, *optional*):
            Additional special tokens used by the tokenizer.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
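
            A minimal sketch of enabling subword regularization through these arguments (assuming the installed
            `sentencepiece` build supports sampling and the underlying `spiece.model` is a unigram model):

            ```python
            >>> from transformers import T5Tokenizer

            >>> tokenizer = T5Tokenizer.from_pretrained(
            ...     "google-t5/t5-base", sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1}
            ... )
            >>> tokenizer.tokenize("New York")  # stochastic: the returned pieces may differ from call to call
            ```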
        legacy (`bool`, *optional*):
            Whether or not the `legacy` behaviour of the tokenizer should be used. Legacy is before the merge of #24622
            and #25224 which includes fixes to properly handle tokens that appear after special tokens. A simple
            example:

            - `legacy=True`:
            ```python
            >>> from transformers import T5Tokenizer

            >>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=True)
            >>> tokenizer.encode("Hello <extra_id_0>.")
            [8774, 32099, 3, 5, 1]
            ```
            - `legacy=False`:
            ```python
            >>> from transformers import T5Tokenizer

            >>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=False)
            >>> tokenizer.encode("Hello <extra_id_0>.")  # the extra space `[3]` is no longer here
            [8774, 32099, 5, 1]
            ```
            Check out the [pull request](https://github.com/huggingface/transformers/pull/24565) for more details.
        add_prefix_space (`bool`, *optional*, defaults to `True`):
            Whether or not to add an initial space to the input. This allows the leading word to be treated just
            like any other word.

    Attributes:
        sp_model (`SentencePieceProcessor`):
            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
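
    A minimal end-to-end sketch (the exact ids are checkpoint-specific, as in the `legacy` examples above):

    ```python
    >>> from transformers import T5Tokenizer

    >>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base")
    >>> ids = tokenizer("Translate English to German: Hello.").input_ids  # a `</s>` id is appended automatically
    >>> text = tokenizer.decode(ids, skip_special_tokens=True)
    ```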
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        extra_ids=100,
        additional_special_tokens=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        legacy=None,
        add_prefix_space=True,
        **kwargs,
    ) -> None:
        eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token

        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        self.vocab_file = vocab_file
        self._extra_ids = extra_ids

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)

        if additional_special_tokens is not None:
            extra_tokens = [x for x in additional_special_tokens if "<extra_id_" in str(x)]
            if len(extra_tokens) < 1:
                additional_special_tokens += [f"<extra_id_{i}>" for i in range(extra_ids)]
            elif extra_ids > 0 and extra_ids != len(extra_tokens):
                raise ValueError(
                    f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
                    " provided to T5Tokenizer. In this case the additional_special_tokens must include the extra_ids"
                    " tokens"
                )
        else:
            extra_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
            additional_special_tokens = extra_tokens

        # Register the sentinel tokens so that they keep the ids used by the original T5 checkpoints
        # (placed at the end of the vocabulary, in reverse order).
        self._added_tokens_decoder = {}
        for i in range(len(extra_tokens)):
            self._added_tokens_decoder[len(self.sp_model) - 1 + extra_ids - i] = AddedToken(
                f"<extra_id_{i}>", single_word=False, lstrip=True, rstrip=True, special=True, normalized=False
            )

        if legacy is None:
            logger.warning_once(
                f"You are using the default legacy behaviour of the {self.__class__}. This is expected, and simply"
                " means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to"
                " use the new behaviour, set `legacy=False`. This should only be set if you understand what it means,"
                " and thoroughly read the reason why this was added as explained in"
                " https://github.com/huggingface/transformers/pull/24565"
            )
            legacy = True

        self.legacy = legacy
        self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
        self.vocab_file = vocab_file
        self.add_prefix_space = add_prefix_space

        super().__init__(
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            extra_ids=extra_ids,
            additional_special_tokens=additional_special_tokens,
            sp_model_kwargs=self.sp_model_kwargs,
            legacy=legacy,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )

    def get_spm_processor(self, from_slow=False):
        tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        if self.legacy or from_slow:  # no dependency on protobuf
            tokenizer.Load(self.vocab_file)
            return tokenizer

        with open(self.vocab_file, "rb") as f:
            sp_model = f.read()
            model_pb2 = import_protobuf(f"The new behaviour of {self.__class__.__name__} (with `self.legacy = False`)")
            model = model_pb2.ModelProto.FromString(sp_model)
            normalizer_spec = model_pb2.NormalizerSpec()
            normalizer_spec.add_dummy_prefix = False
            model.normalizer_spec.MergeFrom(normalizer_spec)
            sp_model = model.SerializeToString()
            tokenizer.LoadFromSerializedProto(sp_model)
        return tokenizer

    @staticmethod
    def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length):
        if pretrained_model_name_or_path in T5Tokenizer.max_model_input_sizes:
            deprecated_max_model_length = T5Tokenizer.max_model_input_sizes[pretrained_model_name_or_path]
            if init_max_model_length is not None and init_max_model_length != max_model_length:
                return init_max_model_length
            elif init_max_model_length is None:
                warnings.warn(
                    "This tokenizer was incorrectly instantiated with a model max length of"
                    f" {deprecated_max_model_length} which will be corrected in Transformers v5.\nFor now, this"
                    " behavior is kept to avoid breaking backwards compatibility when padding/encoding with"
                    " `truncation is True`.\n- Be aware that you SHOULD NOT rely on"
                    f" {pretrained_model_name_or_path} automatically truncating your input to"
                    f" {deprecated_max_model_length} when padding/encoding.\n- If you want to encode/pad to sequences"
                    f" longer than {deprecated_max_model_length} you can either instantiate this tokenizer with"
                    " `model_max_length` or pass `max_length` when encoding/padding.\n- To avoid this warning, please"
                    " instantiate this tokenizer with `model_max_length` set to your preferred value.",
                    FutureWarning,
                )

        return max_model_length

    @property
    def vocab_size(self):
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
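
        For example, a minimal sketch with an already-instantiated tokenizer `tok` and no second sequence, where
        only the position of the appended `</s>` is flagged:

        ```python
        >>> tok.get_special_tokens_mask([8774, 5])
        [0, 0, 1]
        ```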
        T)ru   rv   rw   Nr   r%   )rH   get_special_tokens_maskr?   )rJ   ru   rv   rw   rE   s       r"   ry   z#T5Tokenizer.get_special_tokens_mask   s    $ & 	7722'[]a 3   
 C#k***qc11c+&&&1#-!s;7G7G1GHA3NNr$   c                 b    t          t          t          d | j                                      S )Nc                 J    t          t          j        d|                     d uS )Nz<extra_id_\d+>)boolresearch)r!   s    r"   <lambda>z1T5Tokenizer.get_sentinel_tokens.<locals>.<lambda>  s     bi0A1&E&E!F!Fd!R r$   )listsetfilterr6   rk   s    r"   get_sentinel_tokenszT5Tokenizer.get_sentinel_tokens  s1    RRTXTrsstt
 
 	
r$   c                 D      fd                                  D             S )Nc                 :    g | ]}                     |          S r)   )convert_tokens_to_ids)r    tokenrJ   s     r"   r#   z6T5Tokenizer.get_sentinel_token_ids.<locals>.<listcomp>  s'    ZZZe**511ZZZr$   )r   rk   s   `r"   get_sentinel_token_idsz"T5Tokenizer.get_sentinel_token_ids  s)    ZZZZt?W?W?Y?YZZZZr$   	token_idsc                     t          |          dk    r0|d         | j        k    rt          j        d| j         d           |S || j        gz   S )z.Do not add eos again if user already added it.r   zThis sequence already has zQ. In future versions this behavior may lead to duplicated eos tokens being added.)r?   eos_token_idr`   ra   r2   )rJ   r   s     r"   _add_eos_if_not_presentz#T5Tokenizer._add_eos_if_not_present  si    y>>A)B-43D"D"DM+T^ + + +    1222r$   c                 z    | j         g}|t          ||z             dgz  S t          ||z   |z   |z             dgz  S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
        use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        Nr   )r   r?   )rJ   ru   rv   eoss       r"   $create_token_type_ids_from_sequencesz0T5Tokenizer.create_token_type_ids_from_sequences)  sS       !{S())QC//;${2S899QC??r$   c                 h    |                      |          }||S |                      |          }||z   S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        )r   )rJ   ru   rv   s      r"    build_inputs_with_special_tokensz,T5Tokenizer.build_inputs_with_special_tokens?  sA    & 22;??66{CCK,,r$   c                 B    | j                                         }d |d<   |S )Nr=   )__dict__copy)rJ   states     r"   __getstate__zT5Tokenizer.__getstate__Y  s$    ""$$ jr$   c                     || _         t          | d          si | _        t          j        di | j        | _        | j                            | j                   d S )Nr   r)   )r   hasattrr   r;   r<   r=   r>   r   )rJ   ds     r"   __setstate__zT5Tokenizer.__setstate__^  s_     t.// 	&#%D 2JJT5IJJ4?+++++r$   textr   c                 v   | j         st          |          dk    r t                      j        |fi |S |                    t
          d          }| j        r
t
          |z   } t                      j        |fi |}t          |          dk    r*|d         t
          k    r|d         | j        v r
|dd         }|S )z
        Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the
        first token is special.
        r    r%   N)r7   r?   rH   tokenizereplaceSPIECE_UNDERLINEr8   all_special_tokens)rJ   r   rK   tokensrE   s       r"   r   zT5Tokenizer.tokenizeh  s    
 ; 	4#d))q..#577#D33F333||,c22  	+#d*D!!$11&11v;;??vay,<<<dNeAeAeABBZFr$   c                 t    t          | j                            t          | j                                      S ri   )r?   r=   encoder   r3   rk   s    r"   unk_token_lengthzT5Tokenizer.unk_token_lengthz  s*    4=''DN(;(;<<===r$   c                 .   | j         s|                    t          df          s!| j                            |t
                    S | j                            | j        |z   t
                    }t          |          | j        k    r|| j        d         n|S )u(  
        Returns a tokenized string.

        We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
        SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give
        `['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the
        `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
        `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
        """
        if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
            return self.sp_model.encode(text, out_type=str)

        # 1. Encode string + prefix ex: "<unk> Hey"
        tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
        # 2. Remove self.unk_token from ['<', 'unk', '>', '▁Hey']
        return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        # since we manually add the prefix space, we have to remove it when decoding
        if tokens[0].startswith(SPIECE_UNDERLINE) and self.add_prefix_space:
            tokens[0] = tokens[0][1:]

        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for token in tokens:
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self.all_special_tokens:
                if not prev_is_special:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string.strip()

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)