
    gP7                         d dl Z d dlmZ d dlmZmZmZmZmZm	Z	 d dl
ZddlmZmZ ddlmZ erddlmZ  ej        e          Zdd	iZd
Z G d de          ZdgZdS )    N)copyfile)TYPE_CHECKINGAnyDictListOptionalTuple   )
AddedTokenPreTrainedTokenizer)logging)	TextInput
vocab_fileztokenizer.modelu   ▁c            
           e Zd ZdZeZddgZ	 	 	 	 	 	 	 	 	 	 d%deee	e
f                  f fdZd Zd Zed             Zd Zdddee	         f fdZd Zd Zd Zd Zd&dee	         dee	         fdZd&dZ	 d'dee         deee                  dedee         f fdZ	 d&dee         deee                  dee         fd Z	 	 d(d!ee         d"ed#ede	fd$Z xZS ))GemmaTokenizera
  
    Construct a Gemma tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as there is
    no padding token in the original model.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<bos>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<eos>"`):
            The end of sequence token.
        pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<pad>"`):
            A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
            attention mechanisms or loss computation.
        sp_model_kwargs (`Dict[str, Any]`, `Optional`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

        add_bos_token (`bool`, *optional*, defaults to `True`):
            Whether or not to add an `bos_token` at the start of sequences.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether or not to add an `eos_token` at the end of sequences.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
            extra spaces.
        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
            Whether or not the default system prompt for Gemma should be used.
        spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not to add spaces between special tokens.
    	input_idsattention_mask<unk><bos><eos><pad>NTFsp_model_kwargsc                 P   |i n|| _         t          |t                    rt          |dd          n|}t          |t                    rt          |dd          n|}t          |t                    rt          |dd          n|}t          |t                    rt          |dd          n|}|| _        || _        || _        |
| _        t          j	        di | j         | _
        | j
                            |            t                      j        d||||||||	|
|d
| d S )NFT)
normalizedspecial)
	bos_token	eos_token	unk_token	pad_tokenadd_bos_tokenadd_eos_tokenr   clean_up_tokenization_spacesuse_default_system_promptspaces_between_special_tokens )r   
isinstancestrr   r   r    r!   r#   spmSentencePieceProcessorsp_modelLoadsuper__init__)selfr   r   r   r   r   r   r    r!   r"   r#   r$   kwargs	__class__s                h/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/gemma/tokenization_gemma.pyr-   zGemmaTokenizer.__init__\   si    &5%<rr/MWXacfMgMgvJyUDIIIImv	MWXacfMgMgvJyUDIIIImv	MWXacfMgMgvJyUDIIIImv	MWXacfMgMgvJyUDIIIImv	$**)B&2JJT5IJJ:&&& 	
''+)E&?*G	
 	
 	
 	
 	
 	
 	
    c                 z    | j                                         }d |d<   | j                                        |d<   |S )Nr*   sp_model_proto)__dict__copyr*   serialized_model_proto)r.   states     r1   __getstate__zGemmaTokenizer.__getstate__   s=    ""$$ j"&-"F"F"H"Hr2   c                     || _         t          j        di | j        | _        | j                            | j                   d S )Nr%   )r5   r(   r)   r   r*   LoadFromSerializedProtor4   )r.   ds     r1   __setstate__zGemmaTokenizer.__setstate__   sC    2JJT5IJJ--d.ABBBBBr2   c                 4    | j                                         S )zReturns vocab size)r*   get_piece_size)r.   s    r1   
vocab_sizezGemmaTokenizer.vocab_size   s     }++---r2   c                 |      fdt           j                  D             }|                     j                   |S )zReturns vocab as a dictc                 <    i | ]}                     |          |S r%   )convert_ids_to_tokens).0ir.   s     r1   
<dictcomp>z,GemmaTokenizer.get_vocab.<locals>.<dictcomp>   s)    RRRa++A..RRRr2   )ranger@   updateadded_tokens_encoder)r.   vocabs   ` r1   	get_vocabzGemmaTokenizer.get_vocab   s@    RRRR5;Q;QRRRT.///r2   textr   returnc                 8     t                      j        |fi |S )ze
        Args:
            text: TextInput
        Simply calls PreTrainedTokenizer's method
        )r,   tokenize)r.   rL   r/   r0   s      r1   rO   zGemmaTokenizer.tokenize   s$      uww/////r2   c                 D    | j                             |t                    S )z
        Args:
            text: TextInput
        Returns a tokenized string. The Gemma tokenizer never adds a prefix space.
        )out_type)r*   encoder'   )r.   rL   r/   s      r1   	_tokenizezGemmaTokenizer._tokenize   s     }##D3#777r2   c                 6    | j                             |          S )z0Converts a token (str) in an id using the vocab.)r*   piece_to_id)r.   tokens     r1   _convert_token_to_idz#GemmaTokenizer._convert_token_to_id   s    }((///r2   c                 :    | j                             |          }|S )z=Converts an index (integer) in a token (str) using the vocab.)r*   	IdToPiece)r.   indexrV   s      r1   _convert_id_to_tokenz#GemmaTokenizer._convert_id_to_token   s    ''..r2   c                     g }d}|D ]C}|| j         v r#|| j                            |          |z   z  }g }.|                    |           D|| j                            |          z  }|S )z:Converts a sequence of tokens (string) in a single string. )_added_tokens_encoderr*   decodeappend)r.   tokenscurrent_sub_tokens
out_stringrV   s        r1   convert_tokens_to_stringz'GemmaTokenizer.convert_tokens_to_string   s    
 	1 	1E222dm223EFFNN
%'"""))%0000dm**+=>>>
r2   filename_prefixc                    t           j                            |          s t                              d| d           dS t           j                            ||r|dz   ndt          d         z             }t           j                            | j                  t           j                            |          k    r:t           j        	                    | j                  rt          | j        |           nzt           j        	                    | j                  sVt          |d          5 }| j                                        }|                    |           ddd           n# 1 swxY w Y   |fS )a  
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        zVocabulary path (z) should be a directoryN-r]   r   wb)ospathisdirloggererrorjoinVOCAB_FILES_NAMESabspathr   isfiler   openr*   r7   write)r.   save_directoryre   out_vocab_fileficontent_spiece_models         r1   save_vocabularyzGemmaTokenizer.save_vocabulary   sy    w}}^,, 	LLT^TTTUUUFoM_s222QbcoQpp
 
 7??4?++rw~/N/NNNSUSZSaSabfbqSrSrNT_n555500 	/nd++ /r'+}'K'K'M'M$-.../ / / / / / / / / / / / / / /   s   (/E##E'*E'c                 t    | j         r| j        gng }| j        r| j        gng }||z   |z   }|||z   |z   |z   }|S N)r    bos_token_idr!   eos_token_idr.   token_ids_0token_ids_1r{   r|   outputs         r1    build_inputs_with_special_tokensz/GemmaTokenizer.build_inputs_with_special_tokens   s`    .2.@H)**b.2.@H)**b+l:"l*[8<GFr2   r~   r   already_has_special_tokensc                    |r$t                                          ||d          S | j        rdgng }| j        rdgng }||dgt	          |          z  z   |z   S |dgt	          |          z  z   |z   |z   dgt	          |          z  z   |z   S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)r~   r   r      Nr   )r,   get_special_tokens_maskr    r!   len)r.   r~   r   r   r{   r|   r0   s         r1   r   z&GemmaTokenizer.get_special_tokens_mask   s    $ & 	7722'[]a 3    #08ssb"08ssbA3[)9)9#9:\IIsS%%%'  sS%%%	'
 	
r2   c                     | j         r| j        gng }| j        r| j        gng }dgt	          ||z   |z             z  }||dgt	          ||z   |z             z  z  }|S )a  
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        if token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        r   Nr   )r    r{   r!   r|   r   r}   s         r1   $create_token_type_ids_from_sequencesz3GemmaTokenizer.create_token_type_ids_from_sequences  s    . /3.@H)**b.2.@H)**bs<+5DEEE"qcC{ :\ IJJJJFr2   	token_idsskip_special_tokensr$   c                    g }g }|D ]}|r
|| j         v r|| j        v rW|r-|                    | j                            |                     |                    | j        |         j                   g }n|                    |           |r-|                    | j                            |                     |rd                    |          }nd                    |          }|                    t          d          S )N r]   )	all_special_ids_added_tokens_decoderr`   r*   r_   contentrn   replaceSPIECE_UNDERLINE)r.   r   r   r$   r/   	sub_textscurrent_sub_textidss           r1   _decodezGemmaTokenizer._decode/  s"    	 		- 		-C" sd.B'B'Bd000# M$$T]%9%9:J%K%KLLL  !;C!@!HIII#%   '',,,, 	ET]112BCCDDD( 	+++II	**I  !13777r2   )
r   r   r   r   NTFFFFrz   )NF)FF) __name__
__module____qualname____doc__ro   vocab_files_namesmodel_input_namesr   r   r'   r   r-   r9   r=   propertyr@   rK   r   rO   rS   rW   r[   rd   r	   rx   r   intboolr   r   r   __classcell__)r0   s   @r1   r   r   *   s       , ,\ *$&67
 48%*"'&+(
 (
 "$sCx.1(
 (
 (
 (
 (
 (
T  C C C
 . . X.  0[ 0tCy 0 0 0 0 0 08 8 80 0 0  
  ! !x} !X]^aXb ! ! ! !6	 	 	 	 sx#
 #
9#
3;DI3F#
ko#
	c#
 #
 #
 #
 #
 #
L JN 93;DI3F	c   H %*.3	8 898 "8 (,	8 
8 8 8 8 8 8 8 8r2   r   )ri   shutilr   typingr   r   r   r   r   r	   sentencepiecer(   tokenization_utilsr   r   utilsr   tokenization_utils_baser   
get_loggerr   rl   ro   r   r   __all__r%   r2   r1   <module>r      s
  , 
			       B B B B B B B B B B B B B B B B     A A A A A A A A        5444444		H	%	%!#45  `8 `8 `8 `8 `8( `8 `8 `8F	 
r2   