
from typing import Dict, Iterator, List, Optional, Tuple, Union

from .. import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
from ..models import BPE
from ..normalizers import BertNormalizer, Lowercase, Sequence, unicode_normalizer_from_str
from .base_tokenizer import BaseTokenizer


class CharBPETokenizer(BaseTokenizer):
    """Original BPE Tokenizer

    Represents the BPE algorithm, as introduced by Rico Sennrich
    (https://arxiv.org/abs/1508.07909)

    The default settings correspond to the OpenAI GPT BPE tokenizer and differ from the
    original Sennrich subword-nmt implementation by the following options, which you can
    deactivate:
        - adding a normalizer to clean up the text (deactivate with `bert_normalizer=False`) by:
            * removing any control characters and replacing all whitespaces by the classic one.
            * handling Chinese chars by putting spaces around them.
            * stripping all accents.
        - splitting on punctuation in addition to whitespaces (deactivate it with
          `split_on_whitespace_only=True`)
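
    A minimal usage sketch (illustrative only; the vocab/merges file names are
    hypothetical placeholders for files produced by a previous training run):

        from tokenizers import CharBPETokenizer

        tokenizer = CharBPETokenizer.from_file("./vocab.json", "./merges.txt")
        encoding = tokenizer.encode("Hello, world!")
        print(encoding.tokens)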
    """

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
        unk_token: Union[str, AddedToken] = "<unk>",
        suffix: str = "</w>",
        dropout: Optional[float] = None,
        lowercase: bool = False,
        unicode_normalizer: Optional[str] = None,
        bert_normalizer: bool = True,
        split_on_whitespace_only: bool = False,
    ):
        # Build the underlying model: use the provided vocab/merges when both are given,
        # otherwise start from an empty BPE model.
        if vocab is not None and merges is not None:
            tokenizer = Tokenizer(
                BPE(
                    vocab,
                    merges,
                    dropout=dropout,
                    unk_token=str(unk_token),
                    end_of_word_suffix=suffix,
                )
            )
        else:
            tokenizer = Tokenizer(
                BPE(
                    unk_token=str(unk_token),
                    dropout=dropout,
                    end_of_word_suffix=suffix,
                )
            )

        # Register the unknown token as a special token if it is part of the vocabulary
        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])

        # Check for Unicode normalization first (before everything else)
        normalizers = []

        if unicode_normalizer:
            normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

        if bert_normalizer:
            normalizers += [BertNormalizer(lowercase=False)]

        if lowercase:
            normalizers += [Lowercase()]

        # Create the normalizer structure
        if len(normalizers) > 0:
            if len(normalizers) > 1:
                tokenizer.normalizer = Sequence(normalizers)
            else:
                tokenizer.normalizer = normalizers[0]

        # Pre-tokenization: whitespace only, or BERT-style (whitespace + punctuation)
        if split_on_whitespace_only:
            tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
        else:
            tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

        tokenizer.decoder = decoders.BPEDecoder(suffix=suffix)

        parameters = {
            "model": "BPE",
            "unk_token": unk_token,
            "suffix": suffix,
            "dropout": dropout,
            "lowercase": lowercase,
            "unicode_normalizer": unicode_normalizer,
            "bert_normalizer": bert_normalizer,
            "split_on_whitespace_only": split_on_whitespace_only,
        }

        super().__init__(tokenizer, parameters)

    @staticmethod
    def from_file(vocab_filename: str, merges_filename: str, **kwargs):
        vocab, merges = BPE.read_file(vocab_filename, merges_filename)
        return CharBPETokenizer(vocab, merges, **kwargs)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        suffix: Optional[str] = "</w>",
        show_progress: bool = True,
    ):
        """Train the model using the given files"""

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            end_of_word_suffix=suffix,
            show_progress=show_progress,
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        suffix: Optional[str] = "</w>",
        show_progress: bool = True,
        length: Optional[int] = None,
    ):
        """Train the model using the given iterator"""

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            end_of_word_suffix=suffix,
            show_progress=show_progress,
        )
        self._tokenizer.train_from_iterator(
            iterator,
            trainer=trainer,
            length=length,
        )
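

# A self-contained demo sketch, not part of the original module: it shows how the class
# above is typically used, with made-up sample sentences and deliberately tiny training
# parameters. Run it as a module (e.g. `python -m tokenizers.implementations.char_level_bpe`)
# so that the relative imports at the top of the file resolve.
if __name__ == "__main__":
    samples = [
        "The quick brown fox jumps over the lazy dog.",
        "Byte pair encoding merges frequent character pairs.",
    ]

    demo_tokenizer = CharBPETokenizer()
    # A small vocab_size and min_frequency=1 keep the demo fast on a two-sentence corpus.
    demo_tokenizer.train_from_iterator(
        samples,
        vocab_size=200,
        min_frequency=1,
        length=len(samples),
    )
    print(demo_tokenizer.encode("the quick brown fox").tokens)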