from typing import Dict, Iterator, List, Optional, Tuple, Union

from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
from tokenizers.models import BPE
from tokenizers.normalizers import NFKC

from .base_tokenizer import BaseTokenizer


class SentencePieceBPETokenizer(BaseTokenizer):
    """SentencePiece BPE Tokenizer

    Represents the BPE algorithm, with the pretokenization used by SentencePiece
    """

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
        unk_token: Union[str, AddedToken] = "<unk>",
        replacement: str = "▁",
        add_prefix_space: bool = True,
        dropout: Optional[float] = None,
        fuse_unk: Optional[bool] = False,
    ):
        # Build the underlying BPE model, with or without an initial vocabulary.
        if vocab is not None and merges is not None:
            tokenizer = Tokenizer(BPE(vocab, merges, dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk))
        else:
            tokenizer = Tokenizer(BPE(dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk))

        # Register the unknown token as special if it is already in the vocabulary.
        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])

        # SentencePiece normalizes with NFKC and marks word boundaries with the
        # replacement character ("▁" by default) via the Metaspace pre-tokenizer.
        tokenizer.normalizer = NFKC()
        prepend_scheme = "always" if add_prefix_space else "never"
        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
        tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)

        parameters = {
            "model": "SentencePieceBPE",
            "unk_token": unk_token,
            "replacement": replacement,
            "add_prefix_space": add_prefix_space,
            "dropout": dropout,
        }

        super().__init__(tokenizer, parameters)

    @staticmethod
    def from_file(vocab_filename: str, merges_filename: str, **kwargs):
        vocab, merges = BPE.read_file(vocab_filename, merges_filename)
        return SentencePieceBPETokenizer(vocab, merges, **kwargs)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        show_progress: bool = True,
    ):
        """Train the model using the given files"""

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            show_progress=show_progress,
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        show_progress: bool = True,
        length: Optional[int] = None,
    ):
        """Train the model using the given iterator"""

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            show_progress=show_progress,
        )
        self._tokenizer.train_from_iterator(
            iterator,
            trainer=trainer,
            length=length,
        )
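

# ---------------------------------------------------------------------------
# Illustrative usage sketch (an addition, not part of the upstream library):
# a minimal demonstration of constructing and training this tokenizer. The
# inline corpus and vocab_size are placeholder values, and the guard below
# only runs when the module is executed directly, e.g. via
# `python -m tokenizers.implementations.sentencepiece_bpe`.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Train a fresh SentencePiece-style BPE model from an in-memory corpus.
    tokenizer = SentencePieceBPETokenizer()
    corpus = [
        "Hello world!",
        "SentencePiece BPE is whitespace aware.",
        "Hello again.",
    ]
    tokenizer.train_from_iterator(corpus, vocab_size=500, special_tokens=["<unk>"])

    # The Metaspace pre-tokenizer marks word starts with the "▁" replacement
    # character, so decoding restores the original spacing.
    encoding = tokenizer.encode("Hello world!")
    print(encoding.tokens)
    print(tokenizer.decode(encoding.ids))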