
from typing import Dict, Iterator, List, Optional, Union

from tokenizers import AddedToken, Tokenizer, decoders, trainers
from tokenizers.models import WordPiece
from tokenizers.normalizers import BertNormalizer
from tokenizers.pre_tokenizers import BertPreTokenizer
from tokenizers.processors import BertProcessing

from .base_tokenizer import BaseTokenizer


class BertWordPieceTokenizer(BaseTokenizer):
    """Bert WordPiece Tokenizer"""

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        unk_token: Union[str, AddedToken] = "[UNK]",
        sep_token: Union[str, AddedToken] = "[SEP]",
        cls_token: Union[str, AddedToken] = "[CLS]",
        pad_token: Union[str, AddedToken] = "[PAD]",
        mask_token: Union[str, AddedToken] = "[MASK]",
        clean_text: bool = True,
        handle_chinese_chars: bool = True,
        strip_accents: Optional[bool] = None,
        lowercase: bool = True,
        wordpieces_prefix: str = "##",
    ):
        # Build the underlying WordPiece model, with or without an initial vocabulary
        if vocab is not None:
            tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(unk_token)))
        else:
            tokenizer = Tokenizer(WordPiece(unk_token=str(unk_token)))

        # Let the tokenizer know about the special tokens that are already in the vocabulary
        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])
        if tokenizer.token_to_id(str(sep_token)) is not None:
            tokenizer.add_special_tokens([str(sep_token)])
        if tokenizer.token_to_id(str(cls_token)) is not None:
            tokenizer.add_special_tokens([str(cls_token)])
        if tokenizer.token_to_id(str(pad_token)) is not None:
            tokenizer.add_special_tokens([str(pad_token)])
        if tokenizer.token_to_id(str(mask_token)) is not None:
            tokenizer.add_special_tokens([str(mask_token)])

        # BERT-style normalization and pre-tokenization
        tokenizer.normalizer = BertNormalizer(
            clean_text=clean_text,
            handle_chinese_chars=handle_chinese_chars,
            strip_accents=strip_accents,
            lowercase=lowercase,
        )
        tokenizer.pre_tokenizer = BertPreTokenizer()

        # With a vocabulary available, wrap encodings as "[CLS] ... [SEP]"
        if vocab is not None:
            sep_token_id = tokenizer.token_to_id(str(sep_token))
            if sep_token_id is None:
                raise TypeError("sep_token not found in the vocabulary")
            cls_token_id = tokenizer.token_to_id(str(cls_token))
            if cls_token_id is None:
                raise TypeError("cls_token not found in the vocabulary")

            tokenizer.post_processor = BertProcessing(
                (str(sep_token), sep_token_id), (str(cls_token), cls_token_id)
            )
        tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

        parameters = {
            "model": "BertWordPiece",
            "unk_token": unk_token,
            "sep_token": sep_token,
            "cls_token": cls_token,
            "pad_token": pad_token,
            "mask_token": mask_token,
            "clean_text": clean_text,
            "handle_chinese_chars": handle_chinese_chars,
            "strip_accents": strip_accents,
            "lowercase": lowercase,
            "wordpieces_prefix": wordpieces_prefix,
        }

        super().__init__(tokenizer, parameters)

    @staticmethod
    def from_file(vocab: str, **kwargs):
        vocab = WordPiece.read_file(vocab)
        return BertWordPieceTokenizer(vocab, **kwargs)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        special_tokens: List[Union[str, AddedToken]] = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        show_progress: bool = True,
        wordpieces_prefix: str = "##",
    ):
        """Train the model using the given files"""

        trainer = trainers.WordPieceTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            special_tokens=special_tokens,
            show_progress=show_progress,
            continuing_subword_prefix=wordpieces_prefix,
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        special_tokens: List[Union[str, AddedToken]] = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        show_progress: bool = True,
        wordpieces_prefix: str = "##",
        length: Optional[int] = None,
    ):
        """Train the model using the given iterator"""

        trainer = trainers.WordPieceTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            special_tokens=special_tokens,
            show_progress=show_progress,
            continuing_subword_prefix=wordpieces_prefix,
        )
        self._tokenizer.train_from_iterator(iterator, trainer=trainer, length=length)
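

# Minimal usage sketch of the train-then-encode flow above. "corpus.txt" and
# "bert-wordpiece.json" are placeholder paths, not files shipped with the library.
if __name__ == "__main__":
    tokenizer = BertWordPieceTokenizer(lowercase=True)

    # Learn a WordPiece vocabulary from a plain-text file (one sentence per line).
    tokenizer.train(files=["corpus.txt"], vocab_size=30000)

    # Persist the full tokenizer (model, normalizer, and pre-tokenizer settings).
    tokenizer.save("bert-wordpiece.json")

    # Encode a sentence and inspect the resulting wordpiece tokens.
    print(tokenizer.encode("Hello, world!").tokens)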