"""Tokenization classes for XGLM."""

import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple

import sentencepiece as spm

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

SPIECE_UNDERLINE = "▁"

VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}


class XGLMTokenizer(PreTrainedTokenizer):
    """
    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
    [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
                using the forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

    Attributes:
        sp_model (`SentencePieceProcessor`):
            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
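
    Example (a minimal usage sketch; the `facebook/xglm-564M` checkpoint name is used for illustration, and the
    `sp_model_kwargs` values are one possible subword-regularization setting, not required defaults):

    ```python
    >>> from transformers import XGLMTokenizer

    >>> tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
    >>> ids = tokenizer("Hello world")["input_ids"]

    >>> # with sampling enabled through `sp_model_kwargs`, repeated calls to `tokenize`
    >>> # may return different segmentations of the same text
    >>> sampling_tokenizer = XGLMTokenizer.from_pretrained(
    ...     "facebook/xglm-564M", sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1}
    ... )
    ```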
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> None:
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        # Compatibility with the original fairseq vocab, which reserves 7 extra "madeup" tokens
        self.num_madeup_words = 7
        madeup_words = [f"<madeupword{i}>" for i in range(self.num_madeup_words)]

        kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) or []
        kwargs["additional_special_tokens"] += [
            word for word in madeup_words if word not in kwargs["additional_special_tokens"]
        ]

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(str(vocab_file))
        self.vocab_file = vocab_file

        # The fairseq vocab prepends 4 special tokens (`<s>`, `<pad>`, `</s>`, `<unk>`), so every "real"
        # SentencePiece id is shifted by 1 relative to the fairseq id of the same token
        self.fairseq_offset = 1

        # Mimic the fairseq token-to-id alignment for the first 4 tokens
        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}

        sp_size = len(self.sp_model)
        madeup_words = {f"<madeupword{i}>": sp_size + i + self.fairseq_offset for i in range(self.num_madeup_words)}
        self.fairseq_tokens_to_ids.update(madeup_words)

        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )

    def __getstate__(self):
        state = self.__dict__.copy()
        # `SentencePieceProcessor` instances are not picklable; carry the serialized model proto instead
        state["sp_model"] = None
        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        # for backward compatibility with pickles that predate `sp_model_kwargs`
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. An XGLM sequence has the following format:

        - single sequence: `</s> X`
        - pair of sequences: `</s> A </s></s> B`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
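
        Example (an illustrative sketch; `tokenizer` is assumed to be an initialized `XGLMTokenizer`, whose
        `sep_token_id` (`</s>`) is 2 under the fairseq id mapping):

        ```python
        >>> tokenizer.build_inputs_with_special_tokens([5, 6])  # single sequence: `</s> X`
        [2, 5, 6]
        >>> tokenizer.build_inputs_with_special_tokens([5, 6], [7])  # pair of sequences: `</s> A </s></s> B`
        [2, 5, 6, 2, 2, 7]
        ```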
        """
        if token_ids_1 is None:
            return [self.sep_token_id] + token_ids_0
        sep = [self.sep_token_id]
        return sep + token_ids_0 + sep + sep + token_ids_1

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
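
        Example (an illustrative sketch; `tokenizer` is assumed to be an initialized `XGLMTokenizer`):

        ```python
        >>> # the leading 1 marks the `</s>` that `build_inputs_with_special_tokens` prepends
        >>> tokenizer.get_special_tokens_mask([5, 6])
        [1, 0, 0]
        >>> tokenizer.get_special_tokens_mask([5, 6], [7])
        [1, 0, 0, 1, 1, 0]
        ```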
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0))
        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1))

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. XGLM does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
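
        Example (an illustrative sketch; `tokenizer` is assumed to be an initialized `XGLMTokenizer`):

        ```python
        >>> # six zeros, one per position of `</s> A1 A2 </s></s> B1`
        >>> tokenizer.create_token_type_ids_from_sequences([5, 6], [7])
        [0, 0, 0, 0, 0, 0]
        ```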
        """
        sep = [self.sep_token_id]

        if token_ids_1 is None:
            return len(sep + token_ids_0) * [0]
        return len(sep + token_ids_0 + sep + sep + token_ids_1) * [0]

    @property
    def vocab_size(self):
        return len(self.sp_model) + self.fairseq_offset + self.num_madeup_words

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) into an id using the vocab."""
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        spm_id = self.sp_model.PieceToId(token)

        # return the unknown token id if the SentencePiece model returned 0
        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) into a token (str) using the vocab."""
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        return self.sp_model.IdToPiece(index - self.fairseq_offset)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) into a single string."""
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)