
"""Tokenization classes for CANINE."""

from typing import Dict, List, Optional

from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

# Unicode defines 1,114,112 total "codepoints" (one per possible character),
# so a character-level model can use the code point itself as the token id.
UNICODE_VOCAB_SIZE = 1114112

# Canonical codepoints for CANINE's special, pseudo-characters. Apart from PAD
# (the NUL codepoint), these values fall in Unicode's Private Use Area, so they
# will never be assigned to real characters by the Unicode Consortium and are
# safe to reserve here.
PAD = 0
CLS = 0xE000
SEP = 0xE001
BOS = 0xE002
MASK = 0xE003
RESERVED = 0xE004

# Maps special codepoints to human-readable names.
SPECIAL_CODEPOINTS: Dict[int, str] = {
    CLS: "[CLS]",
    SEP: "[SEP]",
    BOS: "[BOS]",
    MASK: "[MASK]",
    PAD: "[PAD]",
    RESERVED: "[RESERVED]",
}

# Maps special codepoint human-readable names to their codepoint values.
SPECIAL_CODEPOINTS_BY_NAME: Dict[str, int] = {name: codepoint for codepoint, name in SPECIAL_CODEPOINTS.items()}


class CanineTokenizer(PreTrainedTokenizer):
    r"""
    Construct a CANINE tokenizer (i.e. a character splitter). It turns text into a sequence of characters, and then
    converts each character into its Unicode code point.

    [`CanineTokenizer`] inherits from [`PreTrainedTokenizer`].

    Refer to superclass [`PreTrainedTokenizer`] for usage examples and documentation concerning parameters.

    Args:
        model_max_length (`int`, *optional*, defaults to 2048):
                The maximum sentence length the model accepts.
    """

    def __init__(
        self,
        bos_token=chr(CLS),
        eos_token=chr(SEP),
        sep_token=chr(SEP),
        cls_token=chr(CLS),
        pad_token=chr(PAD),
        mask_token=chr(MASK),
        add_prefix_space=False,
        model_max_length=2048,
        **kwargs,
    ):
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token

        # The mask token behaves like a normal word, i.e. it includes the space before it.
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

        # Creates a mapping for looking up the IDs of special symbols.
        self._special_codepoints: Dict[str, int] = {}
        for codepoint, name in SPECIAL_CODEPOINTS.items():
            self._special_codepoints[name] = codepoint

        # Creates a mapping for looking up the string forms of special symbol IDs.
        self._special_codepoint_strings: Dict[int, str] = {
            codepoint: name for name, codepoint in self._special_codepoints.items()
        }

        self._unicode_vocab_size = UNICODE_VOCAB_SIZE
        self._num_special_tokens = len(self._special_codepoints)

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            add_prefix_space=add_prefix_space,
            model_max_length=model_max_length,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        return self._unicode_vocab_size

    def get_vocab(self):
        vocab = {chr(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        """Tokenize a string (i.e. perform character splitting)."""
        return list(text)

    def _convert_token_to_id(self, token: str) -> int:
        """Converts a token (i.e. a Unicode character) into an id (i.e. its integer Unicode code point value)."""
        try:
            return ord(token)
        except TypeError:
            raise ValueError(f"invalid token: '{token}'")

    def _convert_id_to_token(self, index: int) -> str:
        """
        Converts a Unicode code point (integer) into a token (str). In case it's a special code point, convert to
        human-readable format.
        """
        try:
            if index in SPECIAL_CODEPOINTS:
                return SPECIAL_CODEPOINTS[index]
            return chr(index)
        except TypeError:
            raise ValueError(f"invalid id: {index}")

    def convert_tokens_to_string(self, tokens):
        return "".join(tokens)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. A CANINE sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        result = cls + token_ids_0 + sep
        if token_ids_1 is not None:
            result += token_ids_1 + sep
        return result

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        result = [1] + ([0] * len(token_ids_0)) + [1]
        if token_ids_1 is not None:
            result += ([0] * len(token_ids_1)) + [1]
        return result

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A CANINE
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        result = len(cls + token_ids_0 + sep) * [0]
        if token_ids_1 is not None:
            result += len(token_ids_1 + sep) * [1]
        return result

    # CANINE has no vocabulary file to write out, so there is nothing to save.
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None):
        return ()
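

# Illustrative usage sketch (an editorial addition, not part of the upstream
# module): because CANINE tokenizes at the character level and needs no vocab
# file, the tokenizer can be constructed directly with its defaults. Each
# character becomes its Unicode code point, wrapped in the CLS/SEP
# pseudo-characters (0xE000 = 57344 and 0xE001 = 57345). Since this module
# uses relative imports, run it as
# `python -m transformers.models.canine.tokenization_canine`.
if __name__ == "__main__":
    tokenizer = CanineTokenizer()

    ids = tokenizer.encode("hello")
    print(ids)  # [57344, 104, 101, 108, 108, 111, 57345]
    print(tokenizer.decode(ids, skip_special_tokens=True))  # hello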