
    g-'                     ~    d Z ddlZddlmZmZmZ ddlmZmZ ddl	m
Z
  e
j        e          Z G d de          ZdS )	z"Tokenization class for model ByT5.    N)ListOptionalTuple   )
AddedTokenPreTrainedTokenizer)loggingc            
           e Zd ZdZddgZ	 	 	 	 	 d	 d fd
Zed             Zd Z	 d de	e
         dee	e
                  ded	e	e
         f fdZde	e
         d	e	e
         fdZ	 d!de	e
         dee	e
                  d	e	e
         fdZ	 d!de	e
         dee	e
                  d	e	e
         fdZded	e	e         fdZd Zd Zd Zd!dedee         d	ee         fdZ xZS )"ByT5Tokenizera  
    Construct a ByT5 tokenizer. ByT5 simply uses raw bytes utf-8 encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (`int`, *optional*, defaults to 125):
            Add a number of extra ids added to the end of the vocabulary for use as sentinels. These tokens are
            accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are
            indexed from the end of the vocabulary up to beginning ("<extra_id_0>" is the last token in the vocabulary
            like in ByT5 preprocessing see
            [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)).
        additional_special_tokens (`List[str]`, *optional*):
            Additional special tokens used by the tokenizer.
    	input_idsattention_mask</s><unk><pad>}   Nreturnc           	         |dk    r|d t          |          D             }nb|dk    r\|Zt          |          dk    rGt          t          t          d |                              }||k    rt	          d| d| d          t          |t                    rt          |dd          n|}t          |t                    rt          |dd          n|}t          |t                    rt          |dd          n|}|||d	| _        t          | j                  | _	        d
| _
         t                      j        d|||d|d| d S )Nr   c                     g | ]}d | d	S )z
<extra_id_> .0is     f/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/byt5/tokenization_byt5.py
<listcomp>z*ByT5Tokenizer.__init__.<locals>.<listcomp>I   s$    (U(U(Uq):a):):):(U(U(U    c                 >    t          dt          |           v           S )Nextra_id)boolstr)xs    r   <lambda>z(ByT5Tokenizer.__init__.<locals>.<lambda>L   s    Ds1vv9M4N4N r   zBoth extra_ids (z!) and additional_special_tokens (zm) are provided to ByT5Tokenizer. In this case the additional_special_tokens must include the extra_ids tokensT)lstriprstrip)r            )	eos_token	unk_token	pad_token	extra_idsadditional_special_tokensr   )rangelensetfilter
ValueError
isinstancer    r   _added_tokens_decoderoffset_utf_vocab_sizesuper__init__)	selfr(   r)   r*   r+   r,   kwargsextra_tokens	__class__s	           r   r7   zByT5Tokenizer.__init__>   s    q==6>(U(UE)DTDT(U(U(U%%]]8DMfIgIgjkIkIks6*N*NPi#j#jkkllLy(( (y ( (Sl ( ( (   HRR[]`GaGapJydCCCCgp	GQR[]`GaGapJydCCCCgp	GQR[]`GaGapJydCCCCgp	)2yY%O%O"$455# 	
&?	
 	
 	
 	
 	
 	
 	
r   c                     | j         S N)r5   )r8   s    r   
vocab_sizezByT5Tokenizer.vocab_sizee   s    ##r   c                       fdt           j         j        z             D             }|                     j                   |S )Nc                 <    i | ]}                     |          |S r   )convert_ids_to_tokens)r   r   r8   s     r   
<dictcomp>z+ByT5Tokenizer.get_vocab.<locals>.<dictcomp>j   s)    ```a++A..```r   )r-   r>   r4   updateadded_tokens_encoder)r8   vocabs   ` r   	get_vocabzByT5Tokenizer.get_vocabi   sI    ````5SWS^A^;_;_```T.///r   Ftoken_ids_0token_ids_1already_has_special_tokensc                     |r$t                                          ||d          S |dgt          |          z  dgz   S dgt          |          z  dgz   dgt          |          z  z   dgz   S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)rG   rH   rI   Nr   r%   )r6   get_special_tokens_maskr.   )r8   rG   rH   rI   r;   s       r   rK   z%ByT5Tokenizer.get_special_tokens_maskn   s    $ & 	7722'[]a 3   
 C#k***qc11c+&&&1#-!s;7G7G1GHA3NNr   	token_idsc                     t          |          dk    r0|d         | j        k    rt          j        d| j         d           |S || j        gz   S )z.Do not add eos again if user already added it.r   zThis sequence already has zQ. In future versions this behavior may lead to duplicated eos tokens being added.)r.   eos_token_idwarningswarnr(   )r8   rL   s     r   _add_eos_if_not_presentz%ByT5Tokenizer._add_eos_if_not_present   si    y>>A)B-43D"D"DM+T^ + + +    1222r   c                 z    | j         g}|t          ||z             dgz  S t          ||z   |z   |z             dgz  S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. ByT5 does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        Nr   )rO   r.   )r8   rG   rH   eoss       r   $create_token_type_ids_from_sequencesz2ByT5Tokenizer.create_token_type_ids_from_sequences   sS       !{S())QC//;${2S899QC??r   c                 h    |                      |          }||S |                      |          }||z   S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        )rR   )r8   rG   rH   s      r    build_inputs_with_special_tokensz.ByT5Tokenizer.build_inputs_with_special_tokens   sA    & 22;??66{CCK,,r   textc                 D    d |                     d          D             }|S )zPTake as input a string and return a list of strings (tokens) for words/sub-wordsc                 ,    g | ]}t          |          S r   )chrr   s     r   r   z+ByT5Tokenizer._tokenize.<locals>.<listcomp>   s    777Q#a&&777r   utf-8)encode)r8   rX   tokenss      r   	_tokenizezByT5Tokenizer._tokenize   s&    77$++g"6"6777r   c                 `    t          |          dk    rd}nt          |          | j        z   }|S )z0Converts a token (str) in an id using the vocab.r%   N)r.   ordr4   )r8   tokentoken_ids      r   _convert_token_to_idz"ByT5Tokenizer._convert_token_to_id   s1     u::??HH5zzDK/Hr   c                 4    t          || j        z
            }|S )z=Converts an index (integer) in a token (str) using the vocab.)r[   r4   )r8   indexrb   s      r   _convert_id_to_tokenz"ByT5Tokenizer._convert_id_to_token   s    EDK'((r   c                    d}|D ]m}|| j         v r!| j         |                             d          }n<|| j        v r|                    d          }nt          t	          |          g          }||z  }n|                    dd          }|S )z:Converts a sequence of tokens (string) in a single string.r   r\   ignore)errors)added_tokens_decoderr]   rD   bytesra   decode)r8   r^   bstringrb   
tok_stringstrings         r   convert_tokens_to_stringz&ByT5Tokenizer.convert_tokens_to_string   s     	" 	"E111!6u=DDWMM

$333"\\'22

"CJJ<00
z!GG99r   save_directoryfilename_prefixc                     dS )Nr   r   )r8   rr   rs   s      r   save_vocabularyzByT5Tokenizer.save_vocabulary   s    rr   )r   r   r   r   N)r   N)NFr=   )__name__
__module____qualname____doc__model_input_namesr7   propertyr>   rF   r   intr   r   rK   rR   rU   rW   r    r_   rd   rg   rq   r   ru   __classcell__)r;   s   @r   r   r      sR        @ %&67 "&%
 
%
 %
 %
 %
 %
 %
N $ $ X$   sxO O9O3;DI3FOkoO	cO O O O O O8	3c 	3tCy 	3 	3 	3 	3 JN@ @9@3;DI3F@	c@ @ @ @. JN- -9-3;DI3F-	c- - - -4c d3i    
    
   c HSM ]bcf]g        r   r   )ry   rP   typingr   r   r   tokenization_utilsr   r   utilsr	   
get_loggerrv   loggerr   r   r   r   <module>r      s    ) (  ( ( ( ( ( ( ( ( ( ( A A A A A A A A       
	H	%	%N N N N N' N N N N Nr   