
    gR                     z    d Z ddlmZmZmZmZ ddlmZmZ ddl	m
Z
  e
j        e          Z G d de          ZdS )	z!Tokenization class for Perceiver.    )DictListOptionalTuple   )
AddedTokenPreTrainedTokenizer)loggingc            
       d    e Zd ZdZddgZ	 	 	 	 	 	 	 d	 d fdZdeeef         fdZ	e
d             Z	 d dee         deee                  dedee         f fdZ	 d!dee         deee                  dee         fdZdedee         fdZd Zd Zd Zd!dedee         dee         fdZ xZS )"PerceiverTokenizeraS  
    Construct a Perceiver tokenizer. The Perceiver simply uses raw bytes utf-8 encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
            The token used for padding, for example when batching sequences of different lengths.
        bos_token (`str`, *optional*, defaults to `"[BOS]"`):
            The BOS token (reserved in the vocab, but not actually used).
        eos_token (`str`, *optional*, defaults to `"[EOS]"`):
            The end of sequence token (reserved in the vocab, but not actually used).

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The MASK token, useful for masked language modeling.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The CLS token (reserved in the vocab, but not actually used).
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, which is used when building a sequence from two sequences.

    	input_idsattention_mask[PAD][BOS][EOS][MASK][CLS][SEP]   returnNc                    t          |t                    rt          |dd          n|}t          |t                    rt          |dd          n|}t          |t                    rt          |dd          n|}t          |t                    rt          |dd          n|}t          |t                    rt          |dd          n|}t          |t                    rt          |dd          n|}d| _        ||||||d| _        t          | j                  | _         t                      j        d|||||||d| d S )NF)lstriprstrip   )r         r         )	pad_token	bos_token	eos_token
mask_token	cls_token	sep_tokenmodel_max_length )	
isinstancestrr   _utf_vocab_size_added_tokens_decoderlen_num_special_tokenssuper__init__)
selfr   r    r!   r"   r#   r$   r%   kwargs	__class__s
            p/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/perceiver/tokenization_perceiver.pyr.   zPerceiverTokenizer.__init__;   s    JTT]_bIcIcrJyuEEEEir	IST]_bIcIcrJyuEEEEir	IST]_bIcIcrJyuEEEEir	KUV`beKfKfvZ
5GGGGlv
IST]_bIcIcrJyuEEEEir	IST]_bIcIcrJyuEEEEir	# 6
 6
" $'t'A#B#B  		
!-		
 		
 		
 		
 		
 		
 		
    c                     i }t          | j                  D ]}t          |          }|| j        z   ||<   |                    | j                   |S N)ranger)   chrr,   updateadded_tokens_encoder)r/   vocabitokens       r2   	get_vocabzPerceiverTokenizer.get_vocabd   sY    t+,, 	8 	8AFFEt77E%LLT.///r3   c                     | j         S r5   )r)   )r/   s    r2   
vocab_sizezPerceiverTokenizer.vocab_sizel   s    ##r3   Ftoken_ids_0token_ids_1already_has_special_tokensc                     |r$t                                          ||d          S |dgdgt          |          z  z   dgz   S dgdgt          |          z  z   dgz   dgt          |          z  z   dgz   S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)r@   rA   rB   Nr   r   )r-   get_special_tokens_maskr+   )r/   r@   rA   rB   r1   s       r2   rD   z*PerceiverTokenizer.get_special_tokens_maskp   s    $ & 	7722'[]a 3   
 3!s;////1#55sqcC,,,-3sS=M=M7MNRSQTTTr3   c                 n    || j         g|z   | j        gz   S | j         g|z   | j        gz   |z   | j        gz   S )af  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks. A sequence has the
        following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        )cls_token_idsep_token_id)r/   r@   rA   s      r2    build_inputs_with_special_tokensz3PerceiverTokenizer.build_inputs_with_special_tokens   sS    & %&48I7JJJ%&48I7JJ[X\`\m[nnnr3   textc                 D    d |                     d          D             }|S )zPTake as input a string and return a list of strings (tokens) for words/sub-wordsc                 ,    g | ]}t          |          S r&   )r7   ).0r;   s     r2   
<listcomp>z0PerceiverTokenizer._tokenize.<locals>.<listcomp>   s    777Q#a&&777r3   utf-8)encode)r/   rI   tokenss      r2   	_tokenizezPerceiverTokenizer._tokenize   s&    77$++g"6"6777r3   c                 j    t          |          dk    r| j        }nt          |          | j        z   }|S )z0Converts a token (str) in an id using the vocab.r   )r+   unk_token_idordr,   )r/   r<   token_ids      r2   _convert_token_to_idz'PerceiverTokenizer._convert_token_to_id   s3    u::??(HH5zzD$<<Hr3   c                 4    t          || j        z
            }|S )z=Converts an index (integer) in a token (str) using the vocab.)r7   r,   )r/   indexr<   s      r2   _convert_id_to_tokenz'PerceiverTokenizer._convert_id_to_token   s    ED4455r3   c                     d}|D ]P}|| j         v r#t          |                              d          }nt          t	          |          g          }||z  }Q|                    dd          }|S )z:Converts a sequence of tokens (string) in a single string.r3   rN   replace)errors)r9   r(   rO   bytesrT   decode)r/   rP   bstringr<   
tok_stringstrings         r2   convert_tokens_to_stringz+PerceiverTokenizer.convert_tokens_to_string   sz     	" 	"E111 ZZ..w77

"CJJ<00
z!GG	::r3   save_directoryfilename_prefixc                     dS )Nr&   r&   )r/   rc   rd   s      r2   save_vocabularyz"PerceiverTokenizer.save_vocabulary   s    rr3   )r   r   r   r   r   r   r   )r   N)NFr5   )__name__
__module____qualname____doc__model_input_namesr.   r   r(   intr=   propertyr?   r   r   boolrD   rH   rQ   rV   rY   rb   r   rf   __classcell__)r1   s   @r2   r   r      s        < %&67 '
 
'
 '
 '
 '
 '
 '
R4S>     $ $ X$ sxU U9U3;DI3FUkoU	cU U U U U U: JNo o9o3;DI3Fo	co o o o0c d3i    
    
 
 
 c HSM ]bcf]g        r3   r   N)rj   typingr   r   r   r   tokenization_utilsr   r	   utilsr
   
get_loggerrg   loggerr   r&   r3   r2   <module>ru      s    ( ' . . . . . . . . . . . . A A A A A A A A       
	H	%	%k k k k k, k k k k kr3   