
from typing import Dict, Iterator, List, Optional, Tuple, Union

from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, processors, trainers
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase, Sequence, unicode_normalizer_from_str

from .base_tokenizer import BaseTokenizer


class ByteLevelBPETokenizer(BaseTokenizer):
    """ByteLevelBPETokenizer

    Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
    """

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
        add_prefix_space: bool = False,
        lowercase: bool = False,
        dropout: Optional[float] = None,
        unicode_normalizer: Optional[str] = None,
        continuing_subword_prefix: Optional[str] = None,
        end_of_word_suffix: Optional[str] = None,
        trim_offsets: bool = False,
    ):
        # Build a trained BPE model when both vocab and merges are given,
        # otherwise start from an empty model.
        if vocab is not None and merges is not None:
            tokenizer = Tokenizer(
                BPE(
                    vocab,
                    merges,
                    dropout=dropout,
                    continuing_subword_prefix=continuing_subword_prefix or "",
                    end_of_word_suffix=end_of_word_suffix or "",
                )
            )
        else:
            tokenizer = Tokenizer(BPE())

        # Check for Unicode normalization first (before everything else)
        normalizers = []

        if unicode_normalizer:
            normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

        if lowercase:
            normalizers += [Lowercase()]

        # Create the normalizer structure
        if len(normalizers) > 0:
            if len(normalizers) > 1:
                tokenizer.normalizer = Sequence(normalizers)
            else:
                tokenizer.normalizer = normalizers[0]

        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=trim_offsets)

        parameters = {
            "model": "ByteLevelBPE",
            "add_prefix_space": add_prefix_space,
            "lowercase": lowercase,
            "dropout": dropout,
            "unicode_normalizer": unicode_normalizer,
            "continuing_subword_prefix": continuing_subword_prefix,
            "end_of_word_suffix": end_of_word_suffix,
            "trim_offsets": trim_offsets,
        }

        super().__init__(tokenizer, parameters)

    @staticmethod
    def from_file(vocab_filename: str, merges_filename: str, **kwargs):
        vocab, merges = BPE.read_file(vocab_filename, merges_filename)
        return ByteLevelBPETokenizer(vocab, merges, **kwargs)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        show_progress: bool = True,
        special_tokens: List[Union[str, AddedToken]] = [],
    ):
        """Train the model using the given files"""

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            show_progress=show_progress,
            special_tokens=special_tokens,
            # Seed the vocabulary with all 256 byte-level symbols so every
            # input stays representable.
            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        show_progress: bool = True,
        special_tokens: List[Union[str, AddedToken]] = [],
        length: Optional[int] = None,
    ):
        """Train the model using the given iterator"""

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            show_progress=show_progress,
            special_tokens=special_tokens,
            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        )
        self._tokenizer.train_from_iterator(
            iterator,
            trainer=trainer,
            length=length,
        )
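
# ---------------------------------------------------------------------------
# Usage sketch (not part of the upstream module): training from an in-memory
# corpus with train_from_iterator(). The corpus lines, vocab size, and special
# token below are hypothetical placeholders chosen for illustration.
def _demo_train_from_iterator() -> ByteLevelBPETokenizer:
    corpus = [
        "Byte-level BPE works on raw bytes, so no input is ever out-of-vocabulary.",
        "It was introduced by OpenAI together with their GPT-2 model.",
    ]
    # An argument-free constructor yields an untrained byte-level BPE model.
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train_from_iterator(corpus, vocab_size=500, special_tokens=["<pad>"])
    return tokenizer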
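
# ---------------------------------------------------------------------------
# Usage sketch (not part of the upstream module): file-based training plus a
# save/load round trip. "corpus.txt" and the current-directory output path are
# hypothetical placeholders.
if __name__ == "__main__":
    tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)
    tokenizer.train("corpus.txt", vocab_size=5000, special_tokens=["<|endoftext|>"])

    # save_model() is inherited from BaseTokenizer; it writes vocab.json and
    # merges.txt, the two files from_file() expects back.
    tokenizer.save_model(".")
    reloaded = ByteLevelBPETokenizer.from_file("vocab.json", "merges.txt", add_prefix_space=True)
    print(reloaded.encode("Hello, world!").tokens)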