
    g                        d Z ddlZddlmZmZmZ ddlmZ ddlm	Z	m
Z
mZmZmZmZmZ ddlmZmZmZ ddlmZmZmZmZ dd	lmZ  ej        e          ZddZdede fdZ!d Z" G d d          Z# G d de#          Z$de defdZ% G d d          Z& G d de&          Z' G d de&          Z( G d de&          Z) G d de&          Z* G d  d!e&          Z+ G d" d#e&          Z, G d$ d%e&          Z- G d& d'e&          Z. G d( d)e&          Z/ G d* d+e&          Z0 G d, d-e&          Z1 G d. d/e&          Z2 G d0 d1e2          Z3 G d2 d3e2          Z4 G d4 d5e2          Z5 G d6 d7e2          Z6 G d8 d9e2          Z7 G d: d;e2          Z8 G d< d=e2          Z9 G d> d?e2          Z: G d@ dAe2          Z; G dB dCe2          Z< G dD dEe2          Z= G dF dGe2          Z> G dH dIe2          Z? G dJ dKe2          Z@ G dL dMe2          ZA G dN dOe2          ZB G dP dQe&          ZC G dR dSe2          ZD G dT dUe&          ZE G dV dWe&          ZF G dX dYe&          ZG G dZ d[e2          ZH G d\ d]e2          ZI G d^ d_e2          ZJ G d` dae&          ZK G db dce2          ZLdd ZM G de df          ZNi dge3dhe/die4dje'dkeDdleGdme5dneEdoe,dpe'dqe1dre6dse'dte'due'dve'dwe'i dxe3dye)dze,d{e-d|e'd}e'd~e/de;de/de/de'deKde7de8de*de'de/i de9de+de@de.de'de=de>de'de/de0de:de'deAdeBdeCde;de<e(eHeJeJeIeJdZOddefdZPdS )z
Utilities to convert slow tokenizers in their fast tokenizers counterparts.

All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizers files and
allow to make our dependency on SentencePiece optional.
    N)DictListTuple)version)
AddedTokenRegex	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPEUnigram	WordPiece   )is_protobuf_availableis_sentencepiece_availableloggingrequires_backends)PROTOBUF_IMPORT_ERROR c                    t                      rddlm} |S t                      rGdd l}t          j        |j        j                  t          j        d          k     rddl	m} nddl	m
} |S t          t          j        |                     )Nr   )sentencepiece_model_pb2z4.0.0)sentencepiece_model_pb2_new)r   sentencepiecer   r   google.protobufr   parseprotobuf__version__transformers.utilsr   ImportErrorr   format)error_messager   googles      _/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/convert_slow_tokenizer.pyimport_protobufr&   $   s    !## '999999&& 	G=455g8N8NNNBBBBBBBaaaaaa&&/6}EEFFF    add_prefix_spacereturnc                 :    | rd}t          |dd          sd}nd}|S )NalwayslegacyTfirstnever)getattr)r(   original_tokenizerprepend_schemes      r%   _get_prepend_schemer2   5   s5     !!)8T:: 	%$N r'   c                     |d u}|rt          |          n }g }|                                D ]\  }}g }t          dt          |                    D ]6}|d |         ||d          }	}| v r|	 v r|                    ||	|f           7t          | fd          }|                    |           t          |d |          }d |D             }|S )Nr   c                 <    | d                  | d                  fS Nr   r    )xvocabs    r%   <lambda>z!generate_merges.<locals>.<lambda>J   s    U1Q4[%!+,F r'   keyc                 d    | d         t          | d                   t          | d                   fS )N   r   r   )lenvals    r%   r9   z!generate_merges.<locals>.<lambda>M   s%    SVSQ[[#c!f++,N r'   r;   reversec                 .    g | ]}|d          |d         fS r   r   r6   ).0r@   s     r%   
<listcomp>z#generate_merges.<locals>.<listcomp>N   s%    1113s1vs1v111r'   )dictitemsranger>   appendsortedextend)
r8   vocab_scoresrB   mergesmergepiece_scorelocalindexpiece_lpiece_rs
   `         r%   generate_mergesrU   ?   s   $&G)0;4%%%eLF*0022  {1c%jj)) 	> 	>E$VeV}eEFFmWG%Gu$4$4gw<===u"F"F"F"FGGGeF N NX_```F11&111FMr'   c                   X    e Zd ZdZdefdZddeeeef         e	e         f         fdZ
dS )SentencePieceExtractorzl
    Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
    modelc                     t          | d           ddlm}  |            | _        | j                            |           d S )Nr   r   )SentencePieceProcessor)r   r   rZ   spLoad)selfrX   rZ   s      r%   __init__zSentencePieceExtractor.__init__W   sN    $000888888((**Ur'   Nr)   c                     | j         fdt                                                    D             }t          ||          }||fS )
        By default will return vocab and merges with respect to their order, by sending `vocab_scores` we're going to
        order the merges with respect to the piece scores instead.
        c                 <    i | ]}                     |          |S r6   id_to_piecerE   rR   r[   s     r%   
<dictcomp>z2SentencePieceExtractor.extract.<locals>.<dictcomp>d   '    TTT%&&TTTr'   )r[   rI   GetPieceSizerU   r]   rM   r8   rN   r[   s       @r%   extractzSentencePieceExtractor.extract^   sP    
 WTTTT5ARAR;S;STTT 55f}r'   N)__name__
__module____qualname____doc__strr^   r   r   intr   ri   r6   r'   r%   rW   rW   R   so         c    
 
E$sCx.$u+2M,N 
 
 
 
 
 
r'   rW   c                   H    e Zd Zddeeeef         ee         f         fdZdS )GemmaSentencePieceExtractorNr)   c                     | j         fdt                                                    D             }|                    d          |d<   t	          ||          }||fS )r`   c                 <    i | ]}                     |          |S r6   rb   rd   s     r%   re   z7GemmaSentencePieceExtractor.extract.<locals>.<dictcomp>r   rf   r'   <0x09>	)r[   rI   rg   getrU   rh   s       @r%   ri   z#GemmaSentencePieceExtractor.extractl   sg    
 WTTTT5ARAR;S;STTT ii))d 55f}r'   rj   )	rk   rl   rm   r   r   ro   rp   r   ri   r6   r'   r%   rr   rr   k   sJ         E$sCx.$u+2M,N      r'   rr   piecec                 v    t          |           dk     p&| d         dk    p| d                                          S )Nr=   ,)r>   isdigit)rx   s    r%   check_number_commar~   |   s8    u::>HU2Y#-HU2Y5F5F5H5H1HHr'   c                        e Zd Zd ZdefdZdS )	Converterc                     || _         d S rj   )r0   )r]   r0   s     r%   r^   zConverter.__init__   s    "4r'   r)   c                     t                      rj   )NotImplementedErrorr]   s    r%   	convertedzConverter.converted   s    !###r'   N)rk   rl   rm   r^   r	   r   r6   r'   r%   r   r      s>        5 5 5$9 $ $ $ $ $ $r'   r   c                       e Zd ZdefdZdS )BertConverterr)   c           	         | j         j        }t          t          |t	          | j         j                                      }d}d}d}t          | j         d          r3| j         j        j        }| j         j        j	        }| j         j        j
        }t          j        d|||          |_        t          j                    |_        t	          | j         j                  }t	          | j         j                  }| j         j        }| j         j        }	t+          j        | d| d| d| d| d	||f||	fg
          |_        t1          j        d          |_        |S )N	unk_tokenFbasic_tokenizerT
clean_texthandle_chinese_charsstrip_accents	lowercase:0 $A:0 :0:0 $B:1 :1singlepairspecial_tokens##prefixr0   r8   r	   r   ro   r   hasattrr   tokenize_chinese_charsr   do_lower_caser   BertNormalizer
normalizerr   BertPreTokenizerpre_tokenizer	cls_token	sep_tokencls_token_idsep_token_idr   TemplateProcessingpost_processorr
   decoder
r]   r8   	tokenizerr   r   r   clssepr   r   s
             r%   r   zBertConverter.converted   y   '-iT=T=^9_9_```aa	!&4*,=>> 	R%)%<%L%c" 3CQM 3CQM*9!7'#	 
  
  
	 #1"A"C"C	$)344$)344.;.;#-#@**3***5555c555l#l#$
 $
 $
	  %.d;;;	r'   Nrk   rl   rm   r	   r   r6   r'   r%   r   r      /        #9 # # # # # #r'   r   c                       e Zd ZdefdZdS )SplinterConverterr)   c           
         | j         j        }t          t          |t	          | j         j                                      }d}d}d}t          | j         d          r3| j         j        j        }| j         j        j	        }| j         j        j
        }t          j        d|||          |_        t          j                    |_        t	          | j         j                  }t	          | j         j                  }t	          | j         j                  }d}	| j         j        }
| j         j        }| j         j        }| j                             d          }| j         j        dk    r| d| d	|	 d	| d
| d
}n| d| d
| d	|	 d	| d
}t3          j        | d| d|||
f||f||f|	|fg          |_        t9          j        d          |_        |S )Nr   Fr   Tr   .rightr    r   r   r   r   r   r   )r0   r8   r	   r   ro   r   r   r   r   r   r   r   r   r   r   r   r   r   r   question_tokenr   r   question_token_idconvert_tokens_to_idspadding_sider   r   r   r
   r   )r]   r8   r   r   r   r   r   r   questiondotr   r   r   dot_token_idr   s                  r%   r   zSplinterConverter.converted   s   '-iT=T=^9_9_```aa	!&4*,=>> 	R%)%<%L%c" 3CQM 3CQM*9!7'#	 
  
  
	 #1"A"C"C	$)344$)344t.=>>.;.; 3E.DDSII"/7::HH8HHcHHCHHHHHDDHH3HHHH3HHHHHD#-#@**3***l#l#,-l#		$
 	$
 	$
	  %.d;;;	r'   Nr   r6   r'   r%   r   r      s/        .9 . . . . . .r'   r   c                       e Zd ZdefdZdS )FunnelConverterr)   c           	         | j         j        }t          t          |t	          | j         j                                      }d}d}d}t          | j         d          r3| j         j        j        }| j         j        j	        }| j         j        j
        }t          j        d|||          |_        t          j                    |_        t	          | j         j                  }t	          | j         j                  }| j         j        }| j         j        }	t+          j        | d| d| d| d| d	||f||	fg
          |_        t1          j        d          |_        |S )Nr   Fr   Tr   z:2 $A:0 r   r   r   r   r   r   r   r   s
             r%   r   zFunnelConverter.converted   r   r'   Nr   r6   r'   r%   r   r      r   r'   r   c                       e Zd ZdefdZdS )MPNetConverterr)   c                    | j         j        }t          t          |t	          | j         j                                      }d}d}d}t          | j         d          r3| j         j        j        }| j         j        j	        }| j         j        j
        }t          j        d|||          |_        t          j                    |_        t	          | j         j                  }t	          | j         j                  }| j         j        }| j         j        }	t+          j        | d| d| d| d| d	| d
||f||	fg          |_        t1          j        d          |_        |S )Nr   Fr   Tr   r   r   z:0 r   r   r   r   r   r   r   s
             r%   r   zMPNetConverter.converted	  s   '-iT=T=^9_9_```aa	!&4*,=>> 	R%)%<%L%c" 3CQM 3CQM*9!7'#	 
  
  
	 #1"A"C"C	$)344$)344.;.;#-#@**3***======c===l#l#$
 $
 $
	  %.d;;;	r'   Nr   r6   r'   r%   r   r     r   r'   r   c                       e Zd ZdefdZdS )OpenAIGPTConverterr)   c           
         | j         j        }t          | j         j                                                  }| j         j        }t          t          ||d t          |          dd                    }|	                    t          |                    #|
                    t          |          g           t          j        d          |_        t          j                    |_        t#          j        d          |_        |S )N</w>F)r8   rN   dropoutr   end_of_word_suffixfuse_unkT)r   suffix)r0   encoderlist	bpe_rankskeysr   r	   r   ro   token_to_idadd_special_tokensr   r   r   r   r   r   r
   
BPEDecoderr   r]   r8   rN   r   r   s        r%   r   zOpenAIGPTConverter.converted0  s    '/d-7<<>>??+5	i..#)  	
 	
	   Y00<((#i..)9:::*9DIII	"0"A"C"C	$/v>>>	r'   Nr   r6   r'   r%   r   r   /  s/        9      r'   r   c                   P    e Zd Zddeeef         deeeef                  defdZ	dS )GPT2ConverterNr8   rN   r)   c           
         |s| j         j        }|st          | j         j                  }t	          t          ||d ddd                    }t          | j         dd          }t          j        |          |_	        t          j                    |_        t          | j         dd          r>| j         j        }| j         j        }t          j        | d| d||fg	          |_        nt          j        d
          |_        |S )Nr   Fr8   rN   r   continuing_subword_prefixr   r   r(   r(   add_bos_tokenz:0 $A:0z:0 $A:0 $B:1r   trim_offsets)r0   r   r   r   r	   r   r/   r   	ByteLevelr   r
   r   	bos_tokenbos_token_idr   r   r   )r]   r8   rN   r   r(   bosr   s          r%   r   zGPT2Converter.convertedK  s/    	4+3E 	=$1;<<F*,#%  	
 	
	 #4#:<NPUVV"0":L\"]"]"]	$.00	4*OUCC 	P)3C2?L'1'D))),' ( ( (I$$ (2';'O'O'OI$r'   NN
rk   rl   rm   r   ro   rp   r   r   r	   r   r6   r'   r%   r   r   J  sX        " "tCH~ "d5c?>S "_h " " " " " "r'   r   c                       e Zd ZdefdZdS )HerbertConverterr)   c           	      .   d}d}| j         j        }t          | j         j                                                  }||d         d         v r
|dd          }t          t          ||d | j         j        |                    }t          j	        dd          |_
        t          j                    |_        t          j        |          |_        t#          j        | j         j        | j         j        f| j         j        | j         j        f	          |_        |S )
Nz	#version:r   r   r   )r   r   r   F)r   r   r   )r   r   )r0   r   r   r   r   r	   r   r   r   r   r   r   r   r   r
   r   r   r   BertProcessingr   r   r   r   r   )r]   tokenizer_info_strtoken_suffixr8   rN   r   s         r%   r   zHerbertConverter.convertedq  s   ('/d-7<<>>??1--ABBZF1;#/  
 
	  +9EY^___	"0"A"C"C	$/|DDD	#-#<(2D4K4XY(2D4K4XY$
 $
 $
	 
 r'   Nr   r6   r'   r%   r   r   p  /        9      r'   r   c                   P    e Zd Zddeeef         deeeef                  defdZ	dS )Qwen2ConverterNr8   rN   r)   c                 "   |s| j         j        }|s+t          | j         j                                                  }t          t          ||d d dddd                    }t          j                    |_	        t          j        t          j        t          d          dd          t          j        t          | j         dd          d          g          |_        t#          j                    |_        t'          j        d	          |_        |S )
Nr   F)r8   rN   r   r   r   r   r   byte_fallbackzn(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+isolatedbehaviorinvertr(   r(   	use_regexr   )r0   r   r   r   r   r	   r   r   NFCr   r   SequenceSplitr   r   r/   r   r
   r   r   r   )r]   r8   rN   r   s       r%   r   zQwen2Converter.converted  s/    	4+3E 	D$1;@@BBCCF*,#%#	 	 	
 
	  +00	"0"9$ N  (    (%,T-DFXZ_%`%`#  #
 #
	  %.00	#-#7U#K#K#K	 r'   r   r   r6   r'   r%   r   r     sX        ( (tCH~ (d5c?>S (_h ( ( ( ( ( (r'   r   c                       e Zd ZdefdZdS )RobertaConverterr)   c           
         | j         }|j        }t          |j                                                  }t          t          ||d ddd                    }t          j        |j	                  |_
        t          j                    |_        t          j        |j        |j        f|j        |j        f|j	        d          |_        |S )Nr   Fr   r   Tr   r   r(   r   )r0   r   r   r   r   r	   r   r   r   r(   r   r
   r   r   RobertaProcessingr   r   r   r   r   r]   otr8   rN   r   s        r%   r   zRobertaConverter.converted  s    $
bl''))***,#%  	
 	
	 #1":BL_"`"`"`	$.00	#-#?r/r/0	$
 $
 $
	  r'   Nr   r6   r'   r%   r   r     /        9      r'   r   c                       e Zd ZdefdZdS )RoFormerConverterr)   c           	         ddl m} | j        j        }t	          t          |t          | j        j                                      }d}d}t          | j        d          r"| j        j	        j
        }| j        j	        j        }t          j        dd||          |_        t          j                             ||                    |_        t          | j        j                  }t          | j        j                  }| j        j        }| j        j        }	t/          j        | d| d	| d| d
| d||f||	fg          |_        t5          j        d          |_        |S )Nr   )JiebaPreTokenizerr   Fr   Tr   r   r   r   r   r   r   r   )"models.roformer.tokenization_utilsr  r0   r8   r	   r   ro   r   r   r   r   r   r   r   r   r   PreTokenizercustomr   r   r   r   r   r   r   r   r
   r   )
r]   r  r8   r   r   r   r   r   r   r   s
             r%   r   zRoFormerConverter.converted  s   IIIIII'-iT=T=^9_9_```aa	4*,=>> 	R 3CQM 3CQM*9!&'#	 
  
  
	 #1"="D"DEVEVW\E]E]"^"^	$)344$)344.;.;#-#@**3***5555c555l#l#$
 $
 $
	  %.d;;;	r'   Nr   r6   r'   r%   r  r    r   r'   r  c                       e Zd ZdefdZdS )DebertaConverterr)   c           
         | j         }|j        }t          |j                                                  }t          t          ||d ddd                    }t          j        |j	                  |_
        t          j                    |_        t          j        ddd| j                             d          fd| j                             d          fg	          |_        |S )
Nr   Fr   r   [CLS]:0 $A:0 [SEP]:0![CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1[CLS][SEP]r   )r0   r   r   r   r   r	   r   r   r   r(   r   r
   r   r   r   r   r   r   s        r%   r   zDebertaConverter.converted   s    $
bl''))***,#%  	
 	
	 #1":BL_"`"`"`	$.00	#-#@)4$1GGPPQ$1GGPPQ$
 $
 $
	  r'   Nr   r6   r'   r%   r	  r	    r   r'   r	  c                   `     e Zd ZdZeZi Z fdZd Zd Z	d Z
d Zd Zd Zd	 Zd
efdZ xZS )SpmConverterFc                    t          | d            t                      j        |  t                      }|                                }t          | j        j        d          5 }|                    |	                                           d d d            n# 1 swxY w Y   || _
        | j
        j        j        r| j        st          j        d           d S d S d S )Nr   rba  The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.)r   superr^   r&   
ModelProtoopenr0   
vocab_fileParseFromStringreadprototrainer_specr   handle_byte_fallbackwarningswarn)r]   args	model_pb2mf	__class__s        r%   r^   zSpmConverter.__init__#  s"   $
+++$ $%%	  ""$)4d;; 	(qaffhh'''	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	(
:"0 	9R 	Me    	 	 	 	s   $(BBBc                 $    d |j         D             S )Nc                 *    g | ]}|j         |j        fS r6   rx   scorerE   rx   s     r%   rF   z&SpmConverter.vocab.<locals>.<listcomp>9  s!    EEEuek*EEEr'   piecesr]   r  s     r%   r8   zSpmConverter.vocab8  s    EEEEEEr'   c                     |j         j        S rj   )r  unk_idr*  s     r%   r,  zSpmConverter.unk_id;  s    !((r'   c           
          |j         j        }                     |          }|dk    r8t          t	          |                     |           j                            }n|dk    r                      j        j	                  
                    |          \  }}d t          |          D             }t          t          |||j         j        d j        d                     }nt          d           fdt          |j                  D             }|                    d	 t#          |d
           D                        |S )Nr   )r,  r   r=   c                      i | ]\  }\  }}||S r6   r6   )rE   iwordr&  s       r%   re   z*SpmConverter.tokenizer.<locals>.<dictcomp>M  s#    QQQ%5QuqQQQr'   T)r   r   r   r   z]You're trying to run a `Unigram` model but you're file was trained with a different algorithmc                 j    g | ]/\  }}|j         d v ||j        |j         dk    p|j        j        v f0S ))      r2  )typerx   r   )rE   idpr]   s      r%   rF   z*SpmConverter.tokenizer.<locals>.<listcomp>b  sR     
 
 
Av !&A+GD4G)GHr'   c                 :    g | ]\  }}}t          |d |          S )F)
normalizedspecial)r   )rE   r5  tokenr9  s       r%   rF   z*SpmConverter.tokenizer.<locals>.<listcomp>h  s=       &Bw 5UGDDD  r'   c                     | d         S Nr   r6   )r7   s    r%   r9   z(SpmConverter.tokenizer.<locals>.<lambda>j  s    QRSTQU r'   r:   )r  
model_typer8   r	   r   r,  r  SpmExtractorr0   r  ri   	enumerater   	unk_piece	Exceptionr)  
add_tokensrK   )	r]   r  r=  rM   r   _rN   	bpe_vocabspm_added_tokenss	   `        r%   r   zSpmConverter.tokenizer>  s   '2
zz%((??! ;;u--"&";   II 1__))$*A*LMMUUVbccIAvQQ<9P9PQQQI!#0:!"&";   	 	II o  
 
 
 
"5<00
 
 

 	 *01A~~*V*V*V  	
 	
 	
 r'   c                 
   |j         j        }t          j        dd          t          j        t          d          d          g}|st          j        |          S t          j        t          j        |          g|z             S )NFT)leftr    {2,}   ▁)normalizer_specprecompiled_charsmapr   StripReplacer   r   Precompiledr]   r  rK  _normalizerss       r%   r   zSpmConverter.normalizerp  s    $4I5555g66
 $ 	h'555')@AU)V)V(WZf(fgggr'   c                 X    t          || j                  }t          j        ||          S Nreplacementr1   )r2   r0   r   	Metaspacer]   rT  r(   r1   s       r%   r   zSpmConverter.pre_tokenizer{  s,    ,-=t?VWW'KP^____r'   c                     d S rj   r6   r   s    r%   r   zSpmConverter.post_processor  s    tr'   c                 X    t          || j                  }t          j        ||          S rR  )r2   r0   r
   rU  rV  s       r%   r   zSpmConverter.decoder  s+    ,-=t?VWW!k.YYYYr'   r)   c                 x   |                      | j                  }|                     | j                  }|||_        d}d}t          | j        d          r| j        j        }|                     ||          }|||_        |                     ||          |_        |                                 }|r||_        |S )NrI  Tr(   )	r   r  r   r   r0   r(   r   r   r   )r]   r   r   rT  r(   r   r   s          r%   r   zSpmConverter.converted  s    NN4:..	 __TZ00
!#-I 4*,>?? 	H#6G**;8HII$&3I# LL6FGG	,,.. 	6'5I$r'   )rk   rl   rm   r  rW   r>  r   r^   r8   r,  r   r   r   r   r   r	   r   __classcell__r"  s   @r%   r  r    s         )LN    *F F F) ) )0 0 0d	h 	h 	h` ` `  Z Z Z9        r'   r  c                        e Zd Zd Zd Zd ZdS )AlbertConverterc                 $    d |j         D             S )Nc                 t    g | ]5}t          |j                  r|j        |j        fn|j        |j        d z
  f6S d   r~   rx   r&  r'  s     r%   rF   z)AlbertConverter.vocab.<locals>.<listcomp>  X     
 
 
 +=U[*I*IoU[%+&&PUP[]b]hkn]nOo
 
 
r'   r(  r*  s     r%   r8   zAlbertConverter.vocab  %    
 

 
 
 	
r'   c                 f   t          j        dd          t          j        dd          g}| j        j        sL|                    t          j                               |                    t          j                               | j        j        r&|                    t          j                               |j	        j
        }|r'|                    t          j        |                     |                    t          j        t          d          d                     t          j        |          S Nz``"z''rH  r   r   rM  r0   keep_accentsrJ   NFKDStripAccentsr   	LowercaserJ  rK  rN  r   r   r]   r  list_normalizersrK  s       r%   r   zAlbertConverter.normalizer     c**c**
 &3 	@##K$4$6$6777##K$<$>$>???"0 	=##K$9$;$;<<<$4I 	S##K$;<P$Q$QRRR 3E'NNC H HIII#$4555r'   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S Nr  r  r  r  r   r   r   r0   r   r   s    r%   r   zAlbertConverter.post_processor  Y    ,)4$1GGPPQ$1GGPPQ
 
 
 	
r'   Nrk   rl   rm   r8   r   r   r6   r'   r%   r]  r]    A        
 
 
6 6 6&
 
 
 
 
r'   r]  c                       e Zd Zd Zd ZdS )BarthezConverterc                 
    d}|S Nr2  r6   r]   r  r,  s      r%   r,  zBarthezConverter.unk_id      r'   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S Nz<s> $A </s>z<s> $A </s> </s> $B </s><s></s>r   rr  r   s    r%   r   zBarthezConverter.post_processor  Y    , +/EEeLLM0FFvNNO
 
 
 	
r'   N)rk   rl   rm   r,  r   r6   r'   r%   rw  rw    s2          
 
 
 
 
r'   rw  c                        e Zd Zd Zd Zd ZdS )CamembertConverterc                 R    g d}|d |j         dd          D             z  }|dgz  }|S )N))z
<s>NOTUSED        z<pad>r  )z</s>NOTUSEDr  z<unk>r  )z<unk>NOTUSEDic                 *    g | ]}|j         |j        fS r6   r%  r'  s     r%   rF   z,CamembertConverter.vocab.<locals>.<listcomp>  !    KKK5;,KKKr'   r   z<mask>r  r(  r]   r  r8   s      r%   r8   zCamembertConverter.vocab  sK    
 
 
 	KK%,qrr:JKKKK/""r'   c                     dS ry  r6   r*  s     r%   r,  zCamembertConverter.unk_id  s    qr'   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S r}  rr  r   s    r%   r   z!CamembertConverter.post_processor  r  r'   Nrk   rl   rm   r8   r,  r   r6   r'   r%   r  r    sA            
 
 
 
 
r'   r  c                        e Zd Zd Zd Zd ZdS )DebertaV2Converterc                    g }| j         j        r(|                    t          j        d                     t          || j                   }|                    t          j        ||                     t          j        |          S )Nr   )r   rS  )r0   split_by_punctrJ   r   Punctuationr2   rU  r   )r]   rT  r(   list_pretokenizersr1   s        r%   r   z DebertaV2Converter.pre_tokenizer  s    "1 	W%%n&@*&U&U&UVVV,-=t?VWW!!.":{cq"r"r"rsss&'9:::r'   c                    g }| j         j        r&|                    t          j                               |                    t          j                               |j        j        }|r'|                    t          j        |                     |                    t          j	        t          d          d                     t          j        |          S )NrH  r   )r0   r   rJ   r   rl  rL  rJ  rK  rN  rM  r   r   rm  s       r%   r   zDebertaV2Converter.normalizer  s    "0 	=##K$9$;$;<<< 1 3 3444$4I 	S##K$;<P$Q$QRRR 3E'NNC H HIII#$4555r'   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S rq  rr  r   s    r%   r   z!DebertaV2Converter.post_processor  rs  r'   N)rk   rl   rm   r   r   r   r6   r'   r%   r  r    sA        ; ; ;6 6 6
 
 
 
 
r'   r  c                        e Zd Zd Zd Zd ZdS )MBartConverterc                 `    g d}|d |j         dd          D             z  }|g dz  }|dgz  }|S )Nr~  r  r  r  r  r  c                 *    g | ]}|j         |j        fS r6   r%  r'  s     r%   rF   z(MBartConverter.vocab.<locals>.<listcomp>  r  r'   r2  )ar_ARr  cs_CZr  de_DEr  en_XXr  es_XXr  et_EEr  fi_FIr  fr_XXr  gu_INr  hi_INr  it_ITr  ja_XXr  kk_KZr  ko_KRr  lt_LTr  lv_LVr  my_MMr  ne_NPr  nl_XXr  ro_ROr  ru_RUr  si_LKr  tr_TRr  vi_VNr  zh_CNr  r  r(  r  s      r%   r8   zMBartConverter.vocab  sf    
 
 
 	KK%,qrr:JKKKK 
 
 
 	
6 	/""r'   c                     dS ry  r6   r*  s     r%   r,  zMBartConverter.unk_id9      qr'   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S )Nz$A </s> en_XXz$A $B </s> en_XXr  r  r   rr  r   s    r%   r   zMBartConverter.post_processor<  Y    ,"#$1GGPPQ0FFvNNO
 
 
 	
r'   Nr  r6   r'   r%   r  r    sB        $ $ $L  
 
 
 
 
r'   r  c                        e Zd Zd Zd Zd ZdS )MBart50Converterc                 `    g d}|d |j         dd          D             z  }|g dz  }|dgz  }|S )Nr  c                 *    g | ]}|j         |j        fS r6   r%  r'  s     r%   rF   z*MBart50Converter.vocab.<locals>.<listcomp>O  r  r'   r2  )4r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  )af_ZAr  )az_AZr  )bn_INr  )fa_IRr  )he_ILr  )hr_HRr  )id_IDr  )ka_GEr  )km_KHr  )mk_MKr  )ml_INr  )mn_MNr  )mr_INr  )pl_PLr  )ps_AFr  )pt_XXr  )sv_SEr  )sw_KEr  )ta_INr  )te_INr  )th_THr  )tl_XXr  )uk_UAr  )ur_PKr  )xh_ZAr  )gl_ESr  )sl_SIr  r  r(  r  s      r%   r8   zMBart50Converter.vocabH  sh    
 
 
 	KK%,qrr:JKKKK  R  R  R  	R/""r'   c                     dS ry  r6   r*  s     r%   r,  zMBart50Converter.unk_idT  r  r'   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S )Nzen_XX $A </s>zen_XX $A $B </s>r  r  r   rr  r   s    r%   r   zMBart50Converter.post_processorW  r  r'   Nr  r6   r'   r%   r  r  G  sA        
 
 
  
 
 
 
 
r'   r  c                        e Zd Zd Zd Zd ZdS )NllbConverterc                 F    g d}|d |j         dd          D             z  }|S )Nr  c                 *    g | ]}|j         |j        fS r6   r%  r'  s     r%   rF   z'NllbConverter.vocab.<locals>.<listcomp>j  r  r'   r2  r(  r  s      r%   r8   zNllbConverter.vocabc  >    
 
 
 	KK%,qrr:JKKKKr'   c                     dS ry  r6   r*  s     r%   r,  zNllbConverter.unk_idm  r  r'   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S )Nzeng_Latn $A </s>zeng_Latn $A $B </s>eng_Latnr  r   rr  r   s    r%   r   zNllbConverter.post_processorp  sY    ,%&T4JJ:VVW0FFvNNO
 
 
 	
r'   Nr  r6   r'   r%   r  r  b  sA            
 
 
 
 
r'   r  c                        e Zd Zd Zd Zd ZdS )SeamlessM4TConverterc                 F    g d}|d |j         dd          D             z  }|S )N)r  r  r  r  c                 *    g | ]}|j         |j        fS r6   r%  r'  s     r%   rF   z.SeamlessM4TConverter.vocab.<locals>.<listcomp>  r  r'   r2  r(  r  s      r%   r8   zSeamlessM4TConverter.vocab|  r  r'   c                     | j         j        S rj   )r0   unk_token_idr*  s     r%   r,  zSeamlessM4TConverter.unk_id  s    &33r'   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S )Nz__eng__ $A </s>z__eng__ $A $B </s>__eng__r  r   rr  r   s    r%   r   z#SeamlessM4TConverter.post_processor  sY    ,$%D3II)TTU0FFvNNO
 
 
 	
r'   Nr  r6   r'   r%   r  r  {  sA          4 4 4
 
 
 
 
r'   r  c                        e Zd Zd Zd Zd ZdS )XLMRobertaConverterc                 R    g d}|d |j         dd          D             z  }|dgz  }|S )Nr  c                 *    g | ]}|j         |j        fS r6   r%  r'  s     r%   rF   z-XLMRobertaConverter.vocab.<locals>.<listcomp>  r  r'   r2  r  r(  r  s      r%   r8   zXLMRobertaConverter.vocab  sK    
 
 
 	KK%,qrr:JKKKK/""r'   c                 
    d}|S ry  r6   rz  s      r%   r,  zXLMRobertaConverter.unk_id  r{  r'   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S r}  rr  r   s    r%   r   z"XLMRobertaConverter.post_processor  r  r'   Nr  r6   r'   r%   r  r    A        	 	 	  
 
 
 
 
r'   r  c                        e Zd Zd Zd Zd ZdS )XLNetConverterc                 $    d |j         D             S )Nc                 t    g | ]5}t          |j                  r|j        |j        fn|j        |j        d z
  f6S r`  rb  r'  s     r%   rF   z(XLNetConverter.vocab.<locals>.<listcomp>  rc  r'   r(  r*  s     r%   r8   zXLNetConverter.vocab  rd  r'   c                 f   t          j        dd          t          j        dd          g}| j        j        sL|                    t          j                               |                    t          j                               | j        j        r&|                    t          j                               |j	        j
        }|r'|                    t          j        |                     |                    t          j        t          d          d                     t          j        |          S rf  rh  rm  s       r%   r   zXLNetConverter.normalizer  ro  r'   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S )Nz$A:0 <sep>:0 <cls>:2z!$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2z<sep>z<cls>r   rr  r   s    r%   r   zXLNetConverter.post_processor  rs  r'   Nrt  r6   r'   r%   r  r    ru  r'   r  c                       e Zd ZdS )ReformerConverterNrk   rl   rm   r6   r'   r%   r  r            Dr'   r  c                       e Zd Zd Zd ZdS )RemBertConverterc                 >   t          j        dd          t          j        dd          t          j        t          d          d          g}| j        j        sL|                    t          j                               |                    t          j                               | j        j        r&|                    t          j	                               |j
        j        }|r'|                    t          j        |                     t          j        |          S rf  )r   rM  r   r0   ri  rJ   rj  rk  r   rl  rJ  rK  rN  r   rm  s       r%   r   zRemBertConverter.normalizer  s    c**c**g44

 &3 	@##K$4$6$6777##K$<$>$>???"0 	=##K$9$;$;<<<$4I 	S##K$;<P$Q$QRRR#$4555r'   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S rq  rr  r   s    r%   r   zRemBertConverter.post_processor  rs  r'   N)rk   rl   rm   r   r   r6   r'   r%   r  r    s2        6 6 6&
 
 
 
 
r'   r  c                       e Zd ZdS )BertGenerationConverterNr  r6   r'   r%   r  r    r  r'   r  c                   &    e Zd Zd Zd Zd Zd ZdS )PegasusConverterc                 p   | j         j        df| j         j        dfg}| j         j        || j         j        dfgz  }| j         j        ,| j         j        | j         j        k     r|| j         j        dfgz  }|d t          d| j         j                  D             z  }|d |j        dd          D             z  }|S )Nr  c                     g | ]
}d | ddfS )z<unk_>g      Yr6   rE   r/  s     r%   rF   z*PegasusConverter.vocab.<locals>.<listcomp>  s%    [[[Q<1<<<([[[r'   r=   c                 *    g | ]}|j         |j        fS r6   r%  r'  s     r%   rF   z*PegasusConverter.vocab.<locals>.<listcomp>  r  r'   )	r0   	pad_token	eos_tokenmask_token_sent
mask_tokenmask_token_idoffsetrI   r)  r  s      r%   r8   zPegasusConverter.vocab  s    $.4$.4

 "2>t.>DEEE #.:'58O8VVVt.93?@@E[[%4;R;Y2Z2Z[[[[KK%,qrr:JKKKKr'   c                 4    |j         j        | j        j        z   S rj   )r  r,  r0   r'  r*  s     r%   r,  zPegasusConverter.unk_id  s    !(4+B+IIIr'   c                     t          || j                  }t          j        t          j                    t          j        ||          g          S rR  )r2   r0   r   r   WhitespaceSplitrU  rV  s       r%   r   zPegasusConverter.pre_tokenizer  sO    ,-=t?VWW&.00([Q_```
 
 	
r'   c                 p    | j         j        }|| j         j        fg}t          j        d|gdd|g|          S )N$A$Br   )r0   r#  eos_token_idr   r   )r]   eosr   s      r%   r   zPegasusConverter.post_processor  sI    %/$)67
 ,T3KtTSVFWhvwwwwr'   N)rk   rl   rm   r8   r,  r   r   r6   r'   r%   r  r    sX          &J J J
 
 
x x x x xr'   r  c                       e Zd Zd Zd ZdS )T5Converterc                     | j         j        }d |j        D             }|d t          |dz
  dd          D             z  }|S )Nc                 *    g | ]}|j         |j        fS r6   r%  r'  s     r%   rF   z%T5Converter.vocab.<locals>.<listcomp>'  s!    FFF%+u{+FFFr'   c                     g | ]
}d | ddfS )z
<extra_id_r  r  r6   r   s     r%   rF   z%T5Converter.vocab.<locals>.<listcomp>(  s)    UUUq$$$$c*UUUr'   r   rz   )r0   
_extra_idsr)  rI   )r]   r  num_extra_idsr8   s       r%   r8   zT5Converter.vocab%  sR    /:FFFFFUUE-!:KRQS4T4TUUUUr'   c                 n    t          j        ddgg dd| j                            d          fg          S Nr,  r  )r,  r  r-  r  r   rr  r   s    r%   r   zT5Converter.post_processor+  J    ,&>---0FFvNNO
 
 
 	
r'   N)rk   rl   rm   r8   r   r6   r'   r%   r1  r1  $  s2          
 
 
 
 
r'   r1  c                       e Zd Zd ZdS )UdopConverterc                 n    t          j        ddgg dd| j                            d          fg          S r8  rr  r   s    r%   r   zUdopConverter.post_processor6  r9  r'   Nrk   rl   rm   r   r6   r'   r%   r;  r;  5  s#        
 
 
 
 
r'   r;  c                       e Zd ZdefdZdS )WhisperConverterr)   c           
      `   | j         j        }t          | j         j                                                  }t          t          ||d ddd                    }t          j        | j         j	                  |_
        t          j                    |_        | j         j        }| j                             |          }| j         j        }| j         j        }d                    d |D                       }t%          j        | d| d| d	| d
||fgt)          ||                    |_        |S )Nr   Fr   r   r   c                     g | ]}| d S )r   r6   )rE   r:  s     r%   rF   z.WhisperConverter.converted.<locals>.<listcomp>W  s    #G#G#GUuLLL#G#G#Gr'   z $A:0 r   z $A:0 $B:1 r   r   )r0   r   r   r   r   r	   r   r   r   r(   r   r
   r   prefix_tokensconvert_ids_to_tokensr#  r.  joinr   r   zipr   )	r]   r8   rN   r   prefix_token_idsprefixesr/  r.  prefix_templates	            r%   r   zWhisperConverter.convertedA  sQ   '/d-7<<>>??*,#%  	
 	
	 #1":DLcLt"u"u"u	$.00	2@*@@AQRR%/.;((#G#Gh#G#G#GHH#-#@%44S444#77777l#X/00$
 $
 $
	  r'   Nr   r6   r'   r%   r?  r?  @  s/         9            r'   r?  c                       e Zd Zd ZdS )BigBirdConverterc           	          t          j        ddd| j                            d          fd| j                            d          fg          S rq  rr  r   s    r%   r   zBigBirdConverter.post_processore  rs  r'   Nr=  r6   r'   r%   rJ  rJ  d  s#        
 
 
 
 
r'   rJ  c                       e Zd ZdefdZdS )CLIPConverterr)   c                 
   | j         j        }t          | j         j                                                  }| j         j        }t          t          ||d dddt          |                              }t          j
        t          j                    t          j        t          d          d          t          j                    g          |_        t!          j
        t!          j        t          d          dd	
          t!          j        d          g          |_        t)          j                    |_        t-          j        | j         j        | j         j        f| j         j        | j         j        fdd          |_        |S )Nr   r   Fr8   rN   r   r   r   r   r   z\s+r   z9's|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+removedTr   r   r   )r0   r   r   r   r   r   r	   r   ro   r   r   r   rM  r   rl  r   r   r   r   r   r
   r   r   r   r#  r.  r   r   r   r   s        r%   r   zCLIPConverter.convertedq  ss   '/d-7<<>>??+5	*,#)i..  

 

	  +3_ 3E&MM3 G GI^I`I`a 
  
	 #1"9$Z[[&  
 (%@@@	#
 	#
	 %.00	 $.#?(2D4K4XY(2D4K4XY"	$
 $
 $
	  r'   Nr   r6   r'   r%   rM  rM  p  s/        '9 ' ' ' ' ' 'r'   rM  c                       e Zd ZdefdZdS )LayoutLMv2Converterr)   c           	         | j         j        }t          t          |t	          | j         j                                      }d}d}d}t          | j         d          r3| j         j        j        }| j         j        j	        }| j         j        j
        }t          j        d|||          |_        t          j                    |_        t	          | j         j                  }t	          | j         j                  }| j         j        }| j         j        }	t+          j        | d| d| d| d| d	||f||	fg
          |_        t1          j        d          |_        |S )Nr   FTr   r   r   r   r   r   r   r   r   r   r   s
             r%   r   zLayoutLMv2Converter.converted  sy   '-iT=T=^9_9_```aa	!&4*,=>> 	R%)%<%L%c" 3CQM 3CQM*9!7'#	 
  
  
	 #1"A"C"C	$)344$)344.;.;#-#@**3***5555c555l#l#$
 $
 $
	  %.d;;;	r'   Nr   r6   r'   r%   rR  rR    r   r'   rR  c                       e Zd ZdefdZdS )BlenderbotConverterr)   c           
         | j         }|j        }t          |j                                                  }t          t          ||d ddd                    }t          j        |j	                  |_
        t          j                    |_        t          j        d|j         d|j        |j        fg          |_        |S )Nr   Fr   r   z$A:0 r   )r   r   )r0   r   r   r   r   r	   r   r   r   r(   r   r
   r   r   r   r#  r.  r   r   s        r%   r   zBlenderbotConverter.converted  s    $
bl''))***,#%  	
 	
	 #1":BL_"`"`"`	$.00	#-#@+2<+++r/$
 $
 $
	  r'   Nr   r6   r'   r%   rU  rU    r   r'   rU  c                        e Zd Zd Zd Zd ZdS )XGLMConverterc                 T    g d}|d |j         dd          D             z  }|g dz  }|S )Nr  c                 *    g | ]}|j         |j        fS r6   r%  r'  s     r%   rF   z'XGLMConverter.vocab.<locals>.<listcomp>  r  r'   r2  ))z<madeupword0>r  )z<madeupword1>r  )z<madeupword2>r  )z<madeupword3>r  )z<madeupword4>r  )z<madeupword5>r  )z<madeupword6>r  r(  r  s      r%   r8   zXGLMConverter.vocab  s[    
 
 
 	KK%,qrr:JKKKK  z  z  z  	zr'   c                 
    d}|S ry  r6   rz  s      r%   r,  zXGLMConverter.unk_id  r{  r'   c           	          t          j        ddd| j                            d          fd| j                            d          fg          S )Nz</s> $Az</s> $A </s> </s> $Br~  r  r   rr  r   s    r%   r   zXGLMConverter.post_processor  sY    ,'/EEeLLM0FFvNNO
 
 
 	
r'   Nr  r6   r'   r%   rX  rX    r
  r'   rX  c                   >    e Zd ZdZeZddhZ	 d Zd Zd Z	d Z
d Zd	S )
GemmaConvertTz<start_of_turn>z<end_of_turn>c                 ,    t          j        dd          S Nr   rI  )r   rM  r*  s     r%   r   zGemmaConvert.normalizer
  s    "3...r'   c                     | j         j        df| j         j        df| j         j        dfg}|j        dd          D ]-}|j        dk    r|d|j        fgz  }||j        |j        fgz  }.|S )Nr  r2  ru   rv   )r0   r"  r#  r   r)  rx   r&  )r]   r  r8   rx   s       r%   r8   zGemmaConvert.vocab  s    $.4$.4$.4

 \!""% 	6 	6E{h&&4-..5;455r'   c                 ,    t          j        dd          S )Nr   merged_with_previous)r   r   r]   rT  r(   s      r%   r   zGemmaConvert.pre_tokenizer  s    #C)?@@@r'   c                 
    d}|S ry  r6   rz  s      r%   r,  zGemmaConvert.unk_id  r{  r'   c                     t          j        t          j        dd          t          j                    t          j                    g          S )NrI  r   )r
   r   rM  ByteFallbackFuserd  s      r%   r   zGemmaConvert.decoder"  sA      ,,%''
 
 	
r'   N)rk   rl   rm   r  rr   r>  r   r   r8   r   r,  r   r6   r'   r%   r^  r^    s|        .L'9N/ / /  A A A  
 
 
 
 
r'   r^  c                   6    e Zd ZdZd Zd Zd Zd Zd Zd Z	dS )	LlamaConverterTc                     | j                             d          df| j                             d          df| j                             d          dfg}|d |j        dd          D             z  }|S )Nr   r  r   r=   c                 *    g | ]}|j         |j        fS r6   r%  r'  s     r%   rF   z(LlamaConverter.vocab.<locals>.<listcomp>5  r  r'   r2  )r0   rC  r)  r  s      r%   r8   zLlamaConverter.vocab/  s    $::1==sC$::1==sC$::1==sC

 	KK%,qrr:JKKKKr'   c                 
    d}|S r<  r6   rz  s      r%   r,  zLlamaConverter.unk_id8  r{  r'   c                     t          j        dd          t          j                    t          j                    g}|r|t          j        dd          gz  }t          j        |          S NrI  r   r   )contentrG  r
   rM  rg  rh  rL  r   r]   rT  r(   sequences       r%   r   zLlamaConverter.decoder<  e    UC((!##MOO

  	>!<<<==H ***r'   c                     t          | j        dd          r_g }t          | j        dd          r|t          j        d          gz  }|t          j        dd          gz  }t          j        |          S d S )Nr,   Tr(   rI  )prependr   )patternrp  )r/   r0   r   PrependrM  r   )r]   r  rs  s      r%   r   zLlamaConverter.normalizerF  s    4*Hd;; 	2Ht.0BDII A[0???@@,S%HHHIIH'111tr'   c                     t          | j        dd          s,t          || j                  }t          j        ||d          S d S )Nr,   TFrT  r1   split)r/   r0   r2   r   rU  rV  s       r%   r   zLlamaConverter.pre_tokenizerO  sL    t.$?? 	q01A4CZ[[N!+Tbjopppptr'   c                     d S rj   r6   r   s    r%   r   zLlamaConverter.post_processorU  s    tr'   N)
rk   rl   rm   r  r8   r,  r   r   r   r   r6   r'   r%   rj  rj  ,  st            + + +        r'   rj  c                       e Zd ZdefdZdS )MarkupLMConverterr)   c                 (   | j         }|j        }t          |j                                                  }t          t          ||d ddd| j         j                            }t          j	        |j
                  |_        t          j	                    |_        t          | j         j                  }t          | j         j                  }| j         j        }| j         j        }t'          j        | d| | d| d| ||f||fg          |_        |S )Nr   FrO  r   z $A z $B r   )r0   r   r   r   r   r	   r   r   r   r   r(   r   r
   r   ro   r   r   r   r   r   r   r   )	r]   r   r8   rN   r   r   r   r   r   s	            r%   r   zMarkupLMConverter.converted[  s1   $
bl''))***,#%1;  

 

	 #1":BL_"`"`"`	$.00	$)344$)344.;.;#-#@$$s$$++S++c++l#l#$
 $
 $
	  r'   Nr   r6   r'   r%   r~  r~  Z  s/        "9 " " " " " "r'   r~  c                   ,    e Zd ZdZddZd Zd Zd ZdS )MoshiConverterTNc                 L   t          | d           t                              | |           t                      }|                                }t          |d          5 }|                    |                                           d d d            n# 1 swxY w Y   || _        d S )Nr   r  )	r   r   r^   r&   r  r  r  r  r  )r]   r  model_max_lengthkwargsr  r   r!  s          r%   r^   zMoshiConverter.__init__  s    $
+++4,,, $%%	  ""*d## 	(qaffhh'''	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	( 	(


s   (BBBc                     |j         j        }t          j        dd          g}|st          j        |          S t          j        t          j        |          g|z             S r`  )rJ  rK  r   rM  r   rN  rO  s       r%   r   zMoshiConverter.normalizer  sg    $4IU++
 $ 	h'555')@AU)V)V(WZf(fgggr'   c                     t          j        dd          t          j                    t          j                    g}|r|t          j        dd          gz  }t          j        |          S ro  rq  rr  s       r%   r   zMoshiConverter.decoder  rt  r'   c                 4    d}t          j        ||d          S )Nr-   Frz  )r   rU  rV  s       r%   r   zMoshiConverter.pre_tokenizer  s!     'KP^fkllllr'   rj   )rk   rl   rm   r  r^   r   r   r   r6   r'   r%   r  r    sc           h h h+ + +m m m m mr'   r  c            	      \   t          t          t          d          t          d          dz                       t          t          t          d          t          d          dz                       z   t          t          t          d          t          d          dz                       z   } | dd         }d	}t          d
          D ]8}|| vr2|                     |           |                    d
|z              |dz  }9d |D             }t	          t          | |                    S )a8  
    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
    tables between utf-8 bytes and unicode strings.
    !~r      ¡   ¬   ®   ÿNr      c                 ,    g | ]}t          |          S r6   )chr)rE   ns     r%   rF   z$bytes_to_unicode.<locals>.<listcomp>  s    			Q#a&&			r'   )r   rI   ordrJ   rG   rE  )bscsr  bs       r%   bytes_to_unicoder    s    	U3s88SXX\**++d5TCIIPQM3R3R.S.SSVZ[`adeiajajloptluluxyly[z[zV{V{{  
AAAB	A4[[  B;;IIaLLLIIdQhFA		"			BBr'   c                   J     e Zd ZdZ	 	 	 	 d fd	ZdefdZd Zd	efd
Z	 xZ
S )TikTokenConverterz'
    A general tiktoken converter.
    Ns(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+Fc                 l     t                      j        |  || _        || _        || _        || _        d S rj   )r  r^   r  rw  r(   additional_special_tokens)r]   r  rw  r(   r  r  r  r"  s          r%   r^   zTikTokenConverter.__init__  s=     	$$ 0)B&&&r'   tiktoken_urlc                 f   	 ddl m} n# t          $ r t          d          w xY w ||          t	                      fdg }i }                                D ]\  }}|| |          <   t          |          dk    r'g }t          dt          |                    D ]=}|d |         ||d          }
}	|	v r#|
v r|	|
z   v r|                    |	|
|f           >t          |fdd          }|
                    |           t          |d	 d          }fd
|D             }||fS )Nr   )load_tiktoken_bpezY`tiktoken` is required to read a `tiktoken` file. Install it with `pip install tiktoken`.c                 l    d                     fd|                     d          D                       S )Nr   c                 :    g | ]}t          |                   S r6   )r  )rE   charbyte_encoders     r%   rF   zdTikTokenConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string.<locals>.<listcomp>  s$    TTTLT3TTTr'   zlatin-1)rD  decode)r  r  s    r%   token_bytes_to_stringzPTikTokenConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string  s6    77TTTT@S@STTTUUUr'   r   c                 <    | d                  | d                  fS r5   r6   )r7   r   s    r%   r9   zCTikTokenConverter.extract_vocab_merges_from_model.<locals>.<lambda>  s    1Q4)AaD/0R r'   FrA   c                     | d         S )Nr=   r6   r?   s    r%   r9   zCTikTokenConverter.extract_vocab_merges_from_model.<locals>.<lambda>  s
    A r'   c                 T    g | ]$} |d                     |d                   f%S rD   r6   )rE   r@   r  s     r%   rF   zETikTokenConverter.extract_vocab_merges_from_model.<locals>.<listcomp>  s?    cccUX((Q002G2GA2O2OPcccr'   )tiktoken.loadr  rA  
ValueErrorr  rH   r>   rI   rJ   rK   rL   )r]   r  r  rN   r8   r:  rankrQ   rR   rS   rT   r   r  r  s              @@@r%   extract_vocab_merges_from_modelz1TikTokenConverter.extract_vocab_merges_from_model  s   	7777777 	 	 	n  	
 &%l33	'))	V 	V 	V 	V 	V $??,, 
	! 
	!KE426E''../5zzQEq#e**-- ; ;#(%=%-i''Gy,@,@gPWFW\eEeEeLL'7D!9:::5&R&R&R&R\abbbEMM%    $6$6FFFcccc\bcccf}s    &c                     |                      | j                  \  }}t          t          ||d                    }t	          |j        d          rd|j        _        |S )NF)r   ignore_mergesT)r  r  r	   r   r   rX   r  )r]   rM   rN   r   s       r%   r   zTikTokenConverter.tokenizer  s\    #CCDOTTfc,GGGHH	9?O44 	1,0IO)r'   r)   c                 |   |                                  }t          j        t          j        t	          | j                  dd          t          j        | j        d          g          |_        t          j                    |_
        |                    | j                   t          j        d          |_        |S )Nr   Fr   r   r   )r   r   r   r   r   rw  r   r(   r   r
   r   r   r  r   r   )r]   r   s     r%   r   zTikTokenConverter.converted  s    NN$$	"0"9$U4<%8%8:V[\\\($:O[`aaa#
 #
	 %.00	$$T%CDDD#-#7U#K#K#K	 r'   )Nr  FN)rk   rl   rm   rn   r^   ro   r  r   r	   r   rZ  r[  s   @r%   r  r    s           K"&C C C C C CC    >  9        r'   r  AlbertTokenizerBartTokenizerBarthezTokenizerBertTokenizerBigBirdTokenizerBlenderbotTokenizerCamembertTokenizerCLIPTokenizerCodeGenTokenizerConvBertTokenizerDebertaTokenizerDebertaV2TokenizerDistilBertTokenizerDPRReaderTokenizerDPRQuestionEncoderTokenizerDPRContextEncoderTokenizerElectraTokenizerFNetTokenizerFunnelTokenizerGPT2TokenizerHerbertTokenizerLayoutLMTokenizerLayoutLMv2TokenizerLayoutLMv3TokenizerLayoutXLMTokenizerLongformerTokenizerLEDTokenizerLxmertTokenizerMarkupLMTokenizerMBartTokenizerMBart50TokenizerMPNetTokenizerMobileBertTokenizerMvpTokenizerNllbTokenizerOpenAIGPTTokenizerPegasusTokenizerQwen2TokenizerRealmTokenizerReformerTokenizerRemBertTokenizerRetriBertTokenizerRobertaTokenizerRoFormerTokenizerSeamlessM4TTokenizerSqueezeBertTokenizerT5TokenizerUdopTokenizerWhisperTokenizerXLMRobertaTokenizerXLNetTokenizer)SplinterTokenizerXGLMTokenizerLlamaTokenizerCodeLlamaTokenizerGemmaTokenizerPhi3TokenizerFc                    | j         j        }|t          v r,|s*t          |         } ||                                           S 	 t                              d           t          | j        | j                                                  S # t          $ r7 t          dt          t                                                               w xY w)a  
    Utilities to convert a slow tokenizer instance in a fast tokenizer instance.

    Args:
        transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]):
            Instance of a slow tokenizer to convert in the backend tokenizer for
            [`~tokenization_utils_base.PreTrainedTokenizerFast`].
       from_tiktoken (bool, optional): Whether to use the `tiktoken` library to convert the tokenizer instead of sentencepiece.
            Defaults to False.

    Return:
        A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
        [`~tokenization_utils_base.PreTrainedTokenizerFast`]
    zConverting from Tiktoken)r  r  zConverting from Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast convertors: )r"  rk   SLOW_TO_FAST_CONVERTERSr   loggerinfor  r  r  rA  r  r   r   )transformer_tokenizerfrom_tiktokentokenizer_class_nameconverter_classs       r%   convert_slow_tokenizerr  I  s      1:C666}612FG455??AAA	KK2333$0;*?*Y   ikk  	 	 	e>BCZC_C_CaCa>b>be e  	s   AB
 
AC)r   )F)Qrn   r  typingr   r   r   	packagingr   
tokenizersr   r   r	   r
   r   r   r   tokenizers.modelsr   r   r   utilsr   r   r   r   utils.import_utilsr   
get_loggerrk   r  r&   boolro   r2   rU   rW   rr   r~   r   r   r   r   r   r   r   r   r   r   r  r	  r  r]  rw  r  r  r  r  r  r  r  r  r  r  r  r  r1  r;  r?  rJ  rM  rR  rU  rX  r^  rj  r~  r  r  r  r  r  r6   r'   r%   <module>r     s     $ $ $ $ $ $ $ $ $ $       f f f f f f f f f f f f f f f f f f 5 5 5 5 5 5 5 5 5 5 ` ` ` ` ` ` ` ` ` ` ` ` 5 5 5 5 5 5 
	H	%	%G G G G"$ s      &       2    "8   "Ic Id I I I I$ $ $ $ $ $ $ $$ $ $ $ $I $ $ $N/ / / / /	 / / /d$ $ $ $ $i $ $ $N$ $ $ $ $Y $ $ $N       6# # # # #I # # #L    y   >) ) ) ) )Y ) ) )X    y   :$ $ $ $ $	 $ $ $N    y   >~ ~ ~ ~ ~9 ~ ~ ~B"
 "
 "
 "
 "
l "
 "
 "
J
 
 
 
 
| 
 
 
 
 
 
 
 
 
 
 
:
 
 
 
 
 
 
 
B2
 2
 2
 2
 2
\ 2
 2
 2
j
 
 
 
 
| 
 
 
6
 
 
 
 
L 
 
 
2
 
 
 
 
< 
 
 
2
 
 
 
 
, 
 
 
6"
 "
 "
 "
 "
\ "
 "
 "
J	 	 	 	 	 	 	 	
 
 
 
 
| 
 
 
@	 	 	 	 	l 	 	 	%x %x %x %x %x| %x %x %xP
 
 
 
 
, 
 
 
"
 
 
 
 
L 
 
 
! ! ! ! !y ! ! !H	
 	
 	
 	
 	
| 	
 	
 	
( ( ( ( (I ( ( (V$ $ $ $ $) $ $ $N    )   :
 
 
 
 
L 
 
 
6/
 /
 /
 /
 /
< /
 /
 /
d+ + + + +\ + + +\# # # # #	 # # #L&m &m &m &m &m\ &m &m &mT  0G G G G G G G GT::%: (: ]	:
 (: .: ,: ]: : : (: ,: =: -: "=:  !-!:" #: :$ _%:& ':( ]):* (+:, -:. =/:0 +1:2 -3:4 +5:6 $7:8 }9:: *;:< n=:> (?:@ nA:B =C:D $E: : :F ]G:H ,I:J (K:L nM:N mO:P *Q:R (S:T -U:V (W:X *Y:Z 0[:\ M]:^ ;_:` ]a:b (c:d .e:f ng: :h +"$("#s: : : z! !) ! ! ! ! ! !r'   