"""Tokenization classes for LLaMA."""

import os
from shutil import copyfile
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple

import sentencepiece as spm

from ...convert_slow_tokenizer import import_protobuf
from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging


if TYPE_CHECKING:
    from ...tokenization_utils_base import TextInput

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}

SPIECE_UNDERLINE = "▁"

B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

DEFAULT_SYSTEM_PROMPT = (
    "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. "
    "Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. "
    "Please ensure that your responses are socially unbiased and positive in nature.\n\n"
    "If a question does not make any sense, or is not factually coherent, explain why instead of answering something "
    "not correct. If you don't know the answer to a question, please don't share false information."
)
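
# Note: B_INST/E_INST delimit a user turn and B_SYS/E_SYS delimit the system message in the Llama 2 chat prompt
# format; DEFAULT_SYSTEM_PROMPT is the system message associated with `use_default_system_prompt`. As a rough
# sketch (illustrative only, not executed here), a single user turn with a system prompt is assembled as:
#
#     f"{B_INST} {B_SYS}{system_prompt}{E_SYS}{user_message} {E_INST}"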
           e Zd ZdZeZddgZ	 	 	 	 	 	 	 	 	 	 	 	 d"d
eee	e
f                  f fdZed             Zd#dZd Zd Zed             Zd Zdddee	         f fdZd Zd Zd Zd Zd$dee	         dee	         fdZd$dZ	 d%dee         deee                  dedee         f fd Z	 d$dee         deee                  dee         fd!Z xZ S )&LlamaTokenizeru  
    Construct a Llama tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as there is
    no padding token in the original model.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        pad_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token used to make arrays of tokens the same size for batching purposes. Will then be ignored by
            attention mechanisms or loss computation.
        sp_model_kwargs (`Dict[str, Any]`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assumes that nbest_size is infinite and samples from all hypotheses (lattice)
                using the forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

        add_bos_token (`bool`, *optional*, defaults to `True`):
            Whether or not to add a `bos_token` at the start of sequences.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether or not to add an `eos_token` at the end of sequences.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether or not to clean up spaces after decoding; cleanup consists of removing potential artifacts like
            extra spaces.
        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
            Whether or not the default system prompt for Llama should be used.
        spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not to add spaces between special tokens.
        legacy (`bool`, *optional*):
            Whether or not the `legacy` behavior of the tokenizer should be used. Legacy is before the merge of #24622
            and #25224, which include fixes to properly handle tokens that appear after special tokens.
            Make sure to also set `from_slow` to `True`.
            A simple example:

            - `legacy=True`:
            ```python
            >>> from transformers import LlamaTokenizerFast

            >>> tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", legacy=True, from_slow=True)
            >>> tokenizer.encode("Hello <s>.") # 869 is '▁.'
            [1, 15043, 29871, 1, 869]
            ```
            - `legacy=False`:
            ```python
            >>> from transformers import LlamaTokenizerFast

            >>> tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", legacy=False, from_slow=True)
            >>> tokenizer.encode("Hello <s>.")  # 29889 is '.'
            [1, 15043, 29871, 1, 29889]
            ```
            Check out the [pull request](https://github.com/huggingface/transformers/pull/24565) for more details.
        add_prefix_space (`bool`, *optional*, defaults to `True`):
            Whether or not to add an initial space to the input. This allows the leading word to be treated just like
            any other word. Again, this should be set with `from_slow=True` to make sure it is taken into account.
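
    A minimal usage sketch (reusing the illustrative checkpoint name from the `legacy` examples above):

    ```python
    >>> from transformers import LlamaTokenizer

    >>> tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b")
    >>> ids = tokenizer("Hello world")["input_ids"]  # the bos token id is prepended because `add_bos_token=True`
    >>> text = tokenizer.decode(ids)
    ```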
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        use_default_system_prompt=False,
        spaces_between_special_tokens=False,
        legacy=None,
        add_prefix_space=True,
        **kwargs,
    ):
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token

        if legacy is None:
            logger.warning_once(
                f"You are using the default legacy behaviour of the {self.__class__}. This is expected, and simply"
                " means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to"
                " use the new behaviour, set `legacy=False`. This should only be set if you understand what it means,"
                " and thoroughly read the reason why this was added as explained in"
                " https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF"
                " file you can ignore this message"
            )
            legacy = True

        self.legacy = legacy
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.use_default_system_prompt = use_default_system_prompt
        self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
        self.add_prefix_space = add_prefix_space

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            sp_model_kwargs=self.sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            use_default_system_prompt=use_default_system_prompt,
            spaces_between_special_tokens=spaces_between_special_tokens,
            legacy=legacy,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )

    @property
    def unk_token_length(self):
        return len(self.sp_model.encode(str(self.unk_token)))

    def get_spm_processor(self, from_slow=False):
        tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        if self.legacy or from_slow:  # no dependency on protobuf
            tokenizer.Load(self.vocab_file)
            return tokenizer

        with open(self.vocab_file, "rb") as f:
            sp_model = f.read()
            model_pb2 = import_protobuf(f"The new behaviour of {self.__class__.__name__} (with `self.legacy = False`)")
            model = model_pb2.ModelProto.FromString(sp_model)
            normalizer_spec = model_pb2.NormalizerSpec()
            normalizer_spec.add_dummy_prefix = False
            model.normalizer_spec.MergeFrom(normalizer_spec)
            sp_model = model.SerializeToString()
            tokenizer.LoadFromSerializedProto(sp_model)
        return tokenizer

    def __getstate__(self):
        # the SentencePiece processor itself cannot be pickled; store its serialized proto instead
        state = self.__dict__.copy()
        state["sp_model"] = None
        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
        return state

    def __setstate__(self, d):
        self.__dict__.update(d)
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
    @property
    def vocab_size(self):
        """Returns vocab size"""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Returns vocab as a dict"""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def tokenize(self, text: "TextInput", **kwargs) -> List[str]:
        """
        Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the
        first token is special.
        """
        if self.legacy or len(text) == 0:
            return super().tokenize(text, **kwargs)

        text = text.replace(SPIECE_UNDERLINE, " ")
        if self.add_prefix_space:
            text = SPIECE_UNDERLINE + text

        tokens = super().tokenize(text, **kwargs)

        if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
            tokens = tokens[1:]
        return tokens

    def _tokenize(self, text, **kwargs):
        """
        Returns a tokenized string.

        We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
        SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type=str)` will give
        `['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the
        `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`:
        `self.tokenizer.sp_model.encode("<unk> Hey", out_type=str)[4:]`.
        """
        if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
            return self.sp_model.encode(text, out_type=str)

        # 1. Encode the string with the unk_token prefix, e.g. "<unk> Hey"
        tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
        # 2. Strip the unk_token pieces, e.g. ['<', 'unk', '>', '▁Hey'] -> ['▁Hey']
        return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        # since we manually add the prefix space, we have to remove it when decoding
        if tokens[0].startswith(SPIECE_UNDERLINE) and self.add_prefix_space:
            tokens[0] = tokens[0][1:]

        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for i, token in enumerate(tokens):
            # make sure that special tokens are not decoded using the sentencepiece model
            if token in self.all_special_tokens:
                if not prev_is_special and i != 0 and self.legacy:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                if prev_is_special and i == 1 and self.add_prefix_space and not token.startswith(SPIECE_UNDERLINE):
                    out_string += " "
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string
    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = bos_token_id + token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        return output

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        bos_token_id = [1] if self.add_bos_token else []
        eos_token_id = [1] if self.add_eos_token else []

        if token_ids_1 is None:
            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
        return (
            bos_token_id
            + ([0] * len(token_ids_0))
            + eos_token_id
            + bos_token_id
            + ([0] * len(token_ids_1))
            + eos_token_id
        )

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)

        if token_ids_1 is not None:
            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)

        return output