
"""
Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library). For slow (python) tokenizers
see tokenization_utils.py
"""

import copy
import json
import os
from collections import defaultdict
from typing import Any, Dict, List, Optional, Tuple, Union

import tokenizers.pre_tokenizers as pre_tokenizers_fast
from tokenizers import Encoding as EncodingFast
from tokenizers import Tokenizer as TokenizerFast
from tokenizers.decoders import Decoder as DecoderFast
from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer

from .convert_slow_tokenizer import convert_slow_tokenizer
from .integrations.ggml import convert_gguf_tokenizer
from .modeling_gguf_pytorch_utils import load_gguf_checkpoint
from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_base import (
    INIT_TOKENIZER_DOCSTRING,
    AddedToken,
    BatchEncoding,
    PreTokenizedInput,
    PreTokenizedInputPair,
    PreTrainedTokenizerBase,
    SpecialTokensMixin,
    TextInput,
    TextInputPair,
    TruncationStrategy,
)
from .utils import PaddingStrategy, add_end_docstrings, logging


logger = logging.get_logger(__name__)

# Fast tokenizers (provided by the HuggingFace tokenizers library) can be saved in a single file.
TOKENIZER_FILE = "tokenizer.json"
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
TIKTOKEN_VOCAB_FILE = "tokenizer.model"

# Slow tokenizers have an additional added-tokens file.
ADDED_TOKENS_FILE = "added_tokens.json"

INIT_TOKENIZER_DOCSTRING += """
        tokenizer_object ([`tokenizers.Tokenizer`]):
            A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers from 🤗
            tokenizers](../fast_tokenizers) for more information.
        tokenizer_file ([`str`]):
            A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from 🤗
            tokenizers.
"""

MODEL_TO_TRAINER_MAPPING = {
    "BPE": BpeTrainer,
    "Unigram": UnigramTrainer,
    "WordLevel": WordLevelTrainer,
    "WordPiece": WordPieceTrainer,
}

VOCAB_FILES_NAMES = {"tokenizer_file": TOKENIZER_FILE, "vocab_file": TIKTOKEN_VOCAB_FILE}

@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
dCZ;	 	 	 dJdDZ< xZ=S )KPreTrainedTokenizerFastaQ  
    Base class for all fast tokenizers (wrapping HuggingFace tokenizers library).

    Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].

    Handles all the shared methods for tokenization and special tokens, as well as methods for
    downloading/caching/loading pretrained tokenizers, as well as adding tokens to the vocabulary.

    This class also contains the added tokens in a unified way on top of all tokenizers so we don't have to handle the
    specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
    Nslow_tokenizer_classc                 	   |                     dd           }|                     dd           }|                     dd           }|                     dd           }|                     dd          }|                     di           }|r|| j        t          d          |t          j        |          }	nC||st          j        |          }	n)|rt          |          }	n|t          |	                    d	                    }
|
d
         d         }|
d         }|
d         }t          ||          \  }	}|                    |           t          |          dk    r|                    |           n| j        !|dur | j        |i |}t          |          }	n[|sJ|	                    d	d           | _        |	                    dg           | _        t          | d          }	d }nt          d          |	| _        ||                    |j                   d| _        | j        j        }| | j        j        d&i | |                    d|d                    |                    d|d                    |                    d|d                    |                    d|d                    n| j                                         | j        j        }| | j        j        d&i | |                    d|d                    |                    d|d                    |                    d|d                    |                    d|d                    |                    d|d                     t1                      j        d&i | | j        | j        _        d | j        D             fd t;          |                                d! "          D             t?          | j         !                                          d# D             z   fd$| j"        D             z  t                    dk    rg }| j#        }D ]}tI          |tJ                    r|j&        ptO          |          |v ntO          |          |v }tI          |tN                    rtK          ||%          }n||_&        |(                    |           |r| )                    |           d S d S d S )'Ntokenizer_object__slow_tokenizer	gguf_filer'   	from_slowFadded_tokens_decoderzCannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you have sentencepiece installed.r(   config
model_type	tokenizertokenizer_configr   additional_special_tokensT)from_tiktokena9  Couldn't instantiate the backend tokenizer from one of: 
(1) a `tokenizers` library serialization file, 
(2) a slow tokenizer instance to convert or 
(3) an equivalent slow tokenizer class to instantiate and convert. 
You need to have sentencepiece or tiktoken installed to convert a slow tokenizer to a fast one.
max_lengthtruncation_side	directionstridetruncation_strategystrategy	pad_tokenpad_token_type_idpad_type_idpadding_sidelengthpad_to_multiple_ofc                 F    h | ]}t          t          |                    S  hashrepr.0tokens     `/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/tokenization_utils_fast.py	<setcomp>z3PreTrainedTokenizerFast.__init__.<locals>.<setcomp>   s&    $^$^$^5T$u++%6%6$^$^$^    c                 V    g | ]%\  }}t          t          |                    v#|&S rE   rF   )rJ   indexrK   added_tokens_decoder_hashs      rL   
<listcomp>z4PreTrainedTokenizerFast.__init__.<locals>.<listcomp>   sA     
 
 
uDKK  (AAA AAArN   c                     | d         S Nr   rE   )xs    rL   <lambda>z2PreTrainedTokenizerFast.__init__.<locals>.<lambda>   s    STUVSW rN   keyc                 ,    g | ]}t          |          S rE   )strrI   s     rL   rR   z4PreTrainedTokenizerFast.__init__.<locals>.<listcomp>   s    ;b;b;b5CJJ;b;b;brN   c                 $    g | ]}|v|v
|S rE   rE   )rJ   rK   encodertokens_to_adds     rL   rR   z4PreTrainedTokenizerFast.__init__.<locals>.<listcomp>   s7     
 
 
5PWCWCW\aiv\v\vE\v\v\vrN   )specialrE   )*popr+   
ValueErrorcopydeepcopyTokenizerFast	from_filer   r   getr   updatelenr(   r6   
_tokenizerinit_kwargs_decode_use_source_tokenizer
truncationenable_truncation
setdefaultno_truncationpaddingenable_paddingsuper__init__split_special_tokensencode_special_tokensr1   sorteditemslistadded_tokens_encoderkeysall_special_tokens_extendedall_special_tokens
isinstancer   r^   rZ   append
add_tokens)selfargskwargsr-   slow_tokenizerr/   fast_tokenizer_filer0   r1   fast_tokenizer
gguf_paramarchitecturetokenizer_dictr5   additional_kwargs_truncation_paddingtokensspecial_tokensrK   
is_specialrQ   r\   r]   	__class__s                        @@@rL   rr   z PreTrainedTokenizerFast.__init__b   s   !::&8$??$6==JJ{D11	$jj)94@@JJ{E22	%zz*@"EE 	/D4M4U0  
 '!]+;<<NN ,Y,*45HIINN 	3NCCNN"-fjj.F.FGGJ%h/=L'4N)*<=0F|Uc0d0d-N-MM*+++$%%))/000&2~U7R7R6T6GGGN3NCCNN 	$jjt<<DO-3ZZ8SUW-X-XD*3DMMMN!NNr   )%MM.4555,1)o0"-DO-<<<<<lK,EFFF/[1IJJJhH(=>>>3[5LMMMMO))+++?**DO*66X666k8K+@AAA18M3JKKKnh{.CDDDlHX,>???2H=Q4RSSS 	""6"""040I-$^$^DD]$^$^$^!
 
 
 
 &';'A'A'C'C X X X
 
 

 t0557788;b;bTa;b;b;bb 
 
 
 
 
#?
 
 
 	
 }!!F!4N& 
% 
% "%446U]Bc%jjN&BU~5 
 eS)) /&ujAAAEE$.EMe$$$$ (''''' "!( (rN   returnc                     dS )NTrE   r   s    rL   is_fastzPreTrainedTokenizerFast.is_fast   s    trN   c                     dS )z

    @property
    def can_save_slow_tokenizer(self) -> bool:
        """
        `bool`: Whether or not the slow tokenizer can be saved. Usually for sentencepiece based slow tokenizer, this
        can only be `True` if the original `"sentencepiece.model"` was not deleted.
        """
        return True

    @property
    def vocab_size(self) -> int:
        """
        `int`: Size of the base vocabulary (without the added tokens).
        """
        return self._tokenizer.get_vocab_size(with_added_tokens=False)

    def get_vocab(self) -> Dict[str, int]:
        return self._tokenizer.get_vocab(with_added_tokens=True)

    @property
    def vocab(self) -> Dict[str, int]:
        return self.get_vocab()

    @property
    def added_tokens_encoder(self) -> Dict[str, int]:
        """
        Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
        optimisation in `self._added_tokens_encoder` for the slow tokenizers.
        """
        return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}

    @property
    def added_tokens_decoder(self) -> Dict[int, AddedToken]:
        """
        Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.

        Returns:
            `Dict[str, int]`: The added tokens.
        """
        return self._tokenizer.get_added_tokens_decoder()

    def get_added_vocab(self) -> Dict[str, int]:
        """
        Returns the added tokens in the vocabulary as a dictionary of token to index.

        Returns:
            `Dict[str, int]`: The added tokens.
        """
        return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}

    def __len__(self) -> int:
        """
        Size of the full vocabulary with the added tokens.
        """
        return self._tokenizer.get_vocab_size(with_added_tokens=True)

    @property
    def backend_tokenizer(self) -> TokenizerFast:
        """
        `tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
        """
        return self._tokenizer

    @property
    def decoder(self) -> DecoderFast:
        """
        `tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer.
        """
        return self._tokenizer.decoder

    def _convert_encoding(
        self,
        encoding: EncodingFast,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> Tuple[Dict[str, Any], List[EncodingFast]]:
        """
        Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict and a list
        of encodings, take care of building a batch from overflowing tokens.

        Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are
        lists (overflows) of lists (tokens).

        Output shape: (overflows, sequence length)
        """
        if return_token_type_ids is None:
            return_token_type_ids = "token_type_ids" in self.model_input_names
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        if return_overflowing_tokens and encoding.overflowing is not None:
            encodings = [encoding] + encoding.overflowing
        else:
            encodings = [encoding]

        encoding_dict = defaultdict(list)
        for e in encodings:
            encoding_dict["input_ids"].append(e.ids)

            if return_token_type_ids:
                encoding_dict["token_type_ids"].append(e.type_ids)
            if return_attention_mask:
                encoding_dict["attention_mask"].append(e.attention_mask)
            if return_special_tokens_mask:
                encoding_dict["special_tokens_mask"].append(e.special_tokens_mask)
            if return_offsets_mapping:
                encoding_dict["offset_mapping"].append(e.offsets)
            if return_length:
                encoding_dict["length"].append(len(e.ids))

        return encoding_dict, encodings

    def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
        """
        Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the
        vocabulary.

        Args:
            tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s).

        Returns:
            `int` or `List[int]`: The token id or list of token ids.
        """
        if tokens is None:
            return None

        if isinstance(tokens, str):
            return self._convert_token_to_id_with_added_voc(tokens)

        return [self._convert_token_to_id_with_added_voc(token) for token in tokens]

    def _convert_token_to_id_with_added_voc(self, token: str) -> int:
        index = self._tokenizer.token_to_id(token)
        if index is None:
            return self.unk_token_id
        return index

    def _convert_id_to_token(self, index: int) -> Optional[str]:
        return self._tokenizer.id_to_token(int(index))

    def _add_tokens(self, new_tokens: List[Union[str, AddedToken]], special_tokens=False) -> int:
        if special_tokens:
            return self._tokenizer.add_special_tokens(new_tokens)

        return self._tokenizer.add_tokens(new_tokens)

    def num_special_tokens_to_add(self, pair: bool = False) -> int:
        """
        Returns the number of added tokens when encoding a sequence with special tokens.

        <Tip>

        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
        this inside your training loop.

        </Tip>

        Args:
            pair (`bool`, *optional*, defaults to `False`):
                Whether the number of added tokens should be computed in the case of a sequence pair or a single
                sequence.

        Returns:
            `int`: Number of special tokens added to sequences.
        """
        return self._tokenizer.num_special_tokens_to_add(pair)

    def convert_ids_to_tokens(
        self, ids: Union[int, List[int]], skip_special_tokens: bool = False
    ) -> Union[str, List[str]]:
        """
        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
        added tokens.

        Args:
            ids (`int` or `List[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            `str` or `List[str]`: The decoded token(s).
        """
        if isinstance(ids, int):
            return self._tokenizer.id_to_token(ids)
        tokens = []
        for index in ids:
            index = int(index)
            if skip_special_tokens and index in self.all_special_ids:
                continue
            tokens.append(self._tokenizer.id_to_token(index))
        return tokens

    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
        return self.encode_plus(text=text, text_pair=pair, add_special_tokens=add_special_tokens, **kwargs).tokens()

    def set_truncation_and_padding(
        self,
        padding_strategy: PaddingStrategy,
        truncation_strategy: TruncationStrategy,
        max_length: int,
        stride: int,
        pad_to_multiple_of: Optional[int],
        padding_side: Optional[str],
    ):
        """
        Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers
        library) and restore the tokenizer settings afterwards.

        The provided tokenizer has no padding / truncation strategy before the managed section. If your tokenizer set a
        padding / truncation strategy before, then it will be reset to no padding / truncation when exiting the managed
        section.

        Args:
            padding_strategy ([`~utils.PaddingStrategy`]):
                The kind of padding that will be applied to the input
            truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`]):
                The kind of truncation that will be applied to the input
            max_length (`int`):
                The maximum size of a sequence.
            stride (`int`):
                The stride to use when handling overflow.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
            padding_side (`str`, *optional*):
                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
                Default value is picked from the class attribute of the same name.
        """
        _truncation = self._tokenizer.truncation
        _padding = self._tokenizer.padding

        # Set truncation and padding on the backend tokenizer
        if truncation_strategy == TruncationStrategy.DO_NOT_TRUNCATE:
            if _truncation is not None:
                self._tokenizer.no_truncation()
        else:
            target = {
                "max_length": max_length,
                "stride": stride,
                "strategy": truncation_strategy.value,
                "direction": self.truncation_side,
            }

            # _truncation might contain more keys than the target `transformers` supports. Use only the target keys
            # to decide whether `enable_truncation` needs to be called, so this works across `tokenizers` versions.
            if _truncation is None:
                current = None
            else:
                current = {k: _truncation.get(k, None) for k in target}

            if current != target:
                self._tokenizer.enable_truncation(**target)

        if padding_strategy == PaddingStrategy.DO_NOT_PAD:
            if _padding is not None:
                self._tokenizer.no_padding()
        else:
            length = max_length if padding_strategy == PaddingStrategy.MAX_LENGTH else None
            target = {
                "length": length,
                "direction": padding_side if padding_side is not None else self.padding_side,
                "pad_id": self.pad_token_id,
                "pad_token": self.pad_token,
                "pad_type_id": self.pad_token_type_id,
                "pad_to_multiple_of": pad_to_multiple_of,
            }
            if _padding != target:
                self._tokenizer.enable_padding(**target)

    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput], List[TextInputPair], List[PreTokenizedInput], List[PreTokenizedInputPair]
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        padding_side: Optional[str] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        split_special_tokens: bool = False,
    ) -> BatchEncoding:
        if not isinstance(batch_text_or_text_pairs, (tuple, list)):
            raise TypeError(
                f"batch_text_or_text_pairs has to be a list or a tuple (got {type(batch_text_or_text_pairs)})"
            )

        # Set the truncation and padding strategy on the backend tokenizer
        self.set_truncation_and_padding(
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            padding_side=padding_side,
        )

        if self._tokenizer.encode_special_tokens != split_special_tokens:
            self._tokenizer.encode_special_tokens = split_special_tokens

        encodings = self._tokenizer.encode_batch(
            batch_text_or_text_pairs,
            add_special_tokens=add_special_tokens,
            is_pretokenized=is_split_into_words,
        )

        # Convert each encoding to a python dict plus the list of backend encodings, with nested dimensions
        # corresponding to (batch, overflows, sequence length).
        tokens_and_encodings = [
            self._convert_encoding(
                encoding=encoding,
                return_token_type_ids=return_token_type_ids,
                return_attention_mask=return_attention_mask,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_offsets_mapping=return_offsets_mapping,
                return_length=return_length,
                verbose=verbose,
            )
            for encoding in encodings
        ]

        # Convert the output from list[dict] to dict[list] and remove the extra overflows dimension:
        # from shape (batch, overflows, sequence length) to ~ (batch * overflows, sequence length).
        sanitized_tokens = {}
        for key in tokens_and_encodings[0][0].keys():
            stack = [e for item, _ in tokens_and_encodings for e in item[key]]
            sanitized_tokens[key] = stack
        sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]

        # If returning overflowing tokens, also return a mapping from each row back to the original sample
        if return_overflowing_tokens:
            overflow_to_sample_mapping = []
            for i, (toks, _) in enumerate(tokens_and_encodings):
                overflow_to_sample_mapping += [i] * len(toks["input_ids"])
            sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping

        for input_ids in sanitized_tokens["input_ids"]:
            self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
        return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)

    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        padding_side: Optional[str] = None,
        return_tensors: Optional[bool] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        split_special_tokens: bool = False,
        **kwargs,
    ) -> BatchEncoding:
        batched_input = [(text, text_pair)] if text_pair else [text]
        batched_output = self._batch_encode_plus(
            batched_input,
            is_split_into_words=is_split_into_words,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            padding_side=padding_side,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            split_special_tokens=split_special_tokens,
        )

        # If return_tensors is None, unpack the output and keep only the first (overflowing tokens) dimension
        # to match the behavior of the slow tokenizers.
        if return_tensors is None and not return_overflowing_tokens:
            batched_output = BatchEncoding(
                {
                    key: (value[0] if len(value) > 0 and isinstance(value[0], list) else value)
                    for key, value in batched_output.items()
                },
                batched_output.encodings,
            )

        self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)

        return batched_output

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        return self.backend_tokenizer.decoder.decode(tokens)

    def _decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: Optional[bool] = None,
        **kwargs,
    ) -> str:
        self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)

        if isinstance(token_ids, int):
            token_ids = [token_ids]
        text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

        clean_up_tokenization_spaces = (
            clean_up_tokenization_spaces
            if clean_up_tokenization_spaces is not None
            else self.clean_up_tokenization_spaces
        )
        if clean_up_tokenization_spaces:
            clean_text = self.clean_up_tokenization(text)
            return clean_text
        else:
            return text
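
    # Illustrative sketch (not part of the original class): how the padding/truncation strategies set above
    # surface through the public `__call__` API, including the overflow bookkeeping produced by
    # `_batch_encode_plus`. `fast` is assumed to be an instantiated `PreTrainedTokenizerFast`.
    #
    #     enc = fast(
    #         ["a fairly long first example sentence", "short one"],
    #         truncation=True,
    #         max_length=8,
    #         stride=2,
    #         return_overflowing_tokens=True,
    #         padding="max_length",
    #     )
    #     # Each overflowing window becomes its own row; this maps rows back to the original samples.
    #     print(enc["overflow_to_sample_mapping"])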

    def _save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        file_names: Tuple[str],
        legacy_format: Optional[bool] = None,
        filename_prefix: Optional[str] = None,
    ) -> Tuple[str]:
        """
        Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens as well as in a unique JSON
        file containing {config + vocab + added-tokens}.
        """
        save_directory = str(save_directory)

        if self.slow_tokenizer_class is None and legacy_format is True:
            raise ValueError(
                "Your tokenizer does not have a legacy version defined and therefore cannot register this version. You"
                " might consider leaving the legacy_format at `None` or setting it to `False`."
            )

        save_slow = (
            (legacy_format is None or legacy_format is True)
            and self.slow_tokenizer_class is not None
            and self.can_save_slow_tokenizer
        )
        save_fast = legacy_format is None or legacy_format is False

        if save_slow:
            added_tokens_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
            )
            # Only serialize the tokens that are not part of the base vocabulary
            added_vocab = {tok: index for tok, index in self.added_tokens_encoder.items() if index >= self.vocab_size}
            if added_vocab:
                with open(added_tokens_file, "w", encoding="utf-8") as f:
                    out_str = json.dumps(added_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
                    f.write(out_str)

            vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)
            file_names = file_names + vocab_files + (added_tokens_file,)

        if save_fast:
            tokenizer_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_FILE
            )
            self.backend_tokenizer.save(tokenizer_file)
            file_names = file_names + (tokenizer_file,)

        return file_names

    def train_new_from_iterator(
        self,
        text_iterator,
        vocab_size,
        length=None,
        new_special_tokens=None,
        special_tokens_map=None,
        **kwargs,
    ):
        """
        Trains a tokenizer on a new corpus with the same defaults (in terms of special tokens or tokenization pipeline)
        as the current one.

        Args:
            text_iterator (generator of `List[str]`):
                The training corpus. Should be a generator of batches of texts, for instance a list of lists of texts
                if you have everything in memory.
            vocab_size (`int`):
                The size of the vocabulary you want for your tokenizer.
            length (`int`, *optional*):
                The total number of sequences in the iterator. This is used to provide meaningful progress tracking
            new_special_tokens (list of `str` or `AddedToken`, *optional*):
                A list of new special tokens to add to the tokenizer you are training.
            special_tokens_map (`Dict[str, str]`, *optional*):
                If you want to rename some of the special tokens this tokenizer uses, pass along a mapping old special
                token name to new special token name in this argument.
            kwargs (`Dict[str, Any]`, *optional*):
                Additional keyword arguments passed along to the trainer from the 🤗 Tokenizers library.

        Returns:
            [`PreTrainedTokenizerFast`]: A new tokenizer of the same type as the original one, trained on
            `text_iterator`.
        """
        tokenizer_json = json.loads(self._tokenizer.to_str())
        # Remove added tokens for now (uses IDs of tokens)
        added_tokens = tokenizer_json.pop("added_tokens")
        # Remove post processor for now (uses IDs of tokens)
        post_processor = tokenizer_json.pop("post_processor")

        unk_token = None
        # Remove the vocab of the current model
        if tokenizer_json["model"]["type"] == "BPE":
            tokenizer_json["model"]["vocab"] = {}
            tokenizer_json["model"]["merges"] = []
        elif tokenizer_json["model"]["type"] == "Unigram":
            if tokenizer_json["model"]["unk_id"] is not None:
                unk_id = tokenizer_json["model"]["unk_id"]
                unk_token = tokenizer_json["model"]["vocab"][unk_id][0]
                if special_tokens_map is not None and unk_token in special_tokens_map:
                    unk_token = special_tokens_map[unk_token]
                tokenizer_json["model"]["unk_id"] = 0
                tokenizer_json["model"]["vocab"] = [[unk_token, 0.0]]
        elif tokenizer_json["model"]["type"] in ["WordLevel", "WordPiece"]:
            tokenizer_json["model"]["vocab"] = {}
        else:
            raise ValueError(
                f"This method does not support this type of tokenizer (found {tokenizer_json['model']['type']}) "
                "only BPE, Unigram, WordLevel and WordPiece."
            )

        if (
            special_tokens_map is not None
            and "unk_token" in tokenizer_json["model"]
            and tokenizer_json["model"]["unk_token"] in special_tokens_map
        ):
            tokenizer_json["model"]["unk_token"] = special_tokens_map[tokenizer_json["model"]["unk_token"]]

        tokenizer = TokenizerFast.from_str(json.dumps(tokenizer_json))

        # Get the special tokens from the current tokenizer if none are specified.
        special_tokens = []
        for added_token in added_tokens:
            special = added_token.pop("special", None)
            _ = added_token.pop("id", None)
            if tokenizer_json["model"]["type"] != "Unigram" and not special:
                continue
            if special_tokens_map is not None and added_token["content"] in special_tokens_map:
                added_token["content"] = special_tokens_map[added_token["content"]]
            special_tokens.append(AddedToken(**added_token))

        if new_special_tokens is not None:
            special_tokens.extend(new_special_tokens)

        # The trainer needs to know the end-of-word / continuing-subword prefixes for BPE
        if (
            tokenizer_json["model"]["type"] == "BPE"
            and "continuing_subword_prefix" not in kwargs
            and tokenizer_json["model"]["continuing_subword_prefix"] is not None
        ):
            kwargs["continuing_subword_prefix"] = tokenizer_json["model"]["continuing_subword_prefix"]
        if (
            tokenizer_json["model"]["type"] == "BPE"
            and "end_of_word_suffix" not in kwargs
            and tokenizer_json["model"]["end_of_word_suffix"] is not None
        ):
            kwargs["end_of_word_suffix"] = tokenizer_json["model"]["end_of_word_suffix"]
        if tokenizer_json["model"]["type"] == "Unigram" and unk_token is not None:
            kwargs["unk_token"] = unk_token
        if tokenizer_json["pre_tokenizer"] is not None and (
            tokenizer_json["pre_tokenizer"]["type"] == "ByteLevel"
            or (
                tokenizer_json["pre_tokenizer"]["type"] == "Sequence"
                and "pretokenizers" in tokenizer_json["pre_tokenizer"]
                and any(
                    pretokenizer["type"] == "ByteLevel"
                    for pretokenizer in tokenizer_json["pre_tokenizer"]["pretokenizers"]
                )
            )
        ):
            kwargs["initial_alphabet"] = pre_tokenizers_fast.ByteLevel.alphabet()

        trainer_class = MODEL_TO_TRAINER_MAPPING[tokenizer_json["model"]["type"]]
        trainer = trainer_class(vocab_size=vocab_size, special_tokens=special_tokens, **kwargs)
        tokenizer.train_from_iterator(text_iterator, length=length, trainer=trainer)

        if post_processor is not None:
            trained_tokenizer_json = json.loads(tokenizer.to_str())
            # Almost done, we just have to adjust the token IDs in the post processor
            if "special_tokens" in post_processor:
                for key in post_processor["special_tokens"]:
                    tokens = post_processor["special_tokens"][key]["tokens"]
                    if special_tokens_map is not None:
                        tokens = [special_tokens_map.get(token, token) for token in tokens]
                    post_processor["special_tokens"][key]["tokens"] = tokens
                    for token in tokens:
                        token_id = tokenizer.token_to_id(token)
                        if token_id is None:
                            raise ValueError(
                                "Attempted to set a token in the post processor that does not exist in the mapping"
                            )
                    post_processor["special_tokens"][key]["ids"] = [tokenizer.token_to_id(token) for token in tokens]

            for special_token in ["cls", "sep"]:
                if special_token in post_processor:
                    token, _ = post_processor[special_token]
                    if special_tokens_map is not None and token in special_tokens_map:
                        token = special_tokens_map[token]
                    token_id = tokenizer.token_to_id(token)
                    if token_id is None:
                        raise ValueError(
                            "Attempted to set a token in the post processor that does not exist in the mapping"
                        )
                    post_processor[special_token] = [token, token_id]

            trained_tokenizer_json["post_processor"] = post_processor
            tokenizer = TokenizerFast.from_str(json.dumps(trained_tokenizer_json))

        kwargs = self.init_kwargs.copy()
        # Map pad/cls/mask tokens at the Transformers level
        special_tokens_list = SpecialTokensMixin.SPECIAL_TOKENS_ATTRIBUTES.copy()
        special_tokens_list.remove("additional_special_tokens")
        for token in special_tokens_list:
            if getattr(self, token) is not None:
                special_token = getattr(self, token)
                if special_tokens_map is not None and special_token in special_tokens_map:
                    special_token = special_tokens_map[special_token]

                special_token_full = getattr(self, f"_{token}", None)
                if isinstance(special_token_full, AddedToken):
                    # Create an added token with the same parameters except the content
                    kwargs[token] = AddedToken(
                        special_token,
                        single_word=special_token_full.single_word,
                        lstrip=special_token_full.lstrip,
                        rstrip=special_token_full.rstrip,
                        normalized=special_token_full.normalized,
                        special=True,
                    )
                else:
                    kwargs[token] = special_token

        additional_special_tokens = self.additional_special_tokens
        if new_special_tokens is not None:
            additional_special_tokens.extend(new_special_tokens)
        if len(additional_special_tokens) > 0:
            kwargs["additional_special_tokens"] = additional_special_tokens

        return self.__class__(tokenizer_object=tokenizer, **kwargs)
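

# Illustrative sketch (not part of the original module): retraining a fast tokenizer on a new corpus with
# `train_new_from_iterator`. The checkpoint name, corpus, vocab size and output path are hypothetical
# placeholders.
#
#     from transformers import AutoTokenizer
#
#     old_tokenizer = AutoTokenizer.from_pretrained("gpt2")
#     corpus = (batch for batch in [["first document", "second document"], ["third document"]])
#     new_tokenizer = old_tokenizer.train_new_from_iterator(corpus, vocab_size=1000)
#     new_tokenizer.save_pretrained("./retrained-tokenizer")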