
    gy                        d Z ddlZddlZddlmZmZ ddlZddlm	Z	 ddl
mZ  ej        e          Zddd	Zi d
dddddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*d+i d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHdIdJdKdLdMi dNdOdPdQdRdSdTdUdVdWdXdYdZd[d\d]d^d_d`dadbdcdddedfdgdhdidjdkdldmdndodpdqdrdsdtZdu Z G dv dwe	          ZdS )xz)Tokenization classes for Salesforce CTRL.    N)OptionalTuple   )PreTrainedTokenizer)loggingz
vocab.jsonz
merges.txt)
vocab_filemerges_file	Pregnancyi Christianityi  Explaini Fitnessi  Savingi  Aski#j  Assiv Jokei~ 	Questionsi6  Thoughtsi  Retailiv  Feminismi Writingi.  Atheismi Netflixi  	Computingiך  Opinioniͨ  Alonei  Funnyi%  Gamingi  Humani  Indiai3  JokeriR- Dietin  LegaliS.  NormaniK  Tipi Weightiw  Moviesi  Runningi[  Sciencei*  Horrori  
Confessioni  Financei/  Politicsi?  Scaryi Supportin1  Technologiesi  Teenageip Eventi  Learnedi Notioni 	Wikipediaiϒ  Booksi	  Extracti) Confessionsi- 
Conspiracyi( Linksi  	NarcissusiK Relationshipi  Relationshipsi iǢ  i  ih  i )ReviewsNewsTranslationmultilingualc                     t                      }| d         }| dd         D ]}|                    ||f           |}t          |          }|S )z
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    r      N)setadd)wordpairs	prev_charchars       f/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/ctrl/tokenization_ctrl.py	get_pairsrJ   ^   s[     EEEQIQRR  		9d#$$$		JJEL    c                        e Zd ZdZeZeZd fd	Ze	d             Z
d Zd Zd Zd Zd	 Zd
 Zddedee         dee         fdZ xZS )CTRLTokenizera`  
    Construct a CTRL tokenizer. Based on Byte-Pair-Encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
    <unk>c           
      `   t          |d          5 }t          j        |          | _        d d d            n# 1 swxY w Y   d | j                                        D             | _        t          |d          5 }|                                                    d          dd         }d d d            n# 1 swxY w Y   d |D             }t          t          |t          t          |                                        | _        i | _         t                      j        d	d|i| d S )
Nutf-8encodingc                     i | ]\  }}||	S  rT   ).0kvs      rI   
<dictcomp>z*CTRLTokenizer.__init__.<locals>.<dictcomp>   s    >>>A1>>>rK   
rB   c                 P    g | ]#}t          |                                          $S rT   )tuplesplit)rU   merges     rI   
<listcomp>z*CTRLTokenizer.__init__.<locals>.<listcomp>   s(    ;;;5%&&;;;rK   	unk_tokenrT   )openjsonloadencoderitemsdecoderreadr]   dictziprangelen	bpe_rankscachesuper__init__)	selfr   r	   r`   kwargsvocab_handlemerges_handlemerges	__class__s	           rI   ro   zCTRLTokenizer.__init__   s   *w/// 	3<9\22DL	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3>>););)=)=>>>+000 	<M"''))//55ad;F	< 	< 	< 	< 	< 	< 	< 	< 	< 	< 	< 	< 	< 	< 	<;;F;;;c&%F*<*<==>>
779777777s   9= ==0B99B= B=c                 *    t          | j                  S N)rk   rd   rp   s    rI   
vocab_sizezCTRLTokenizer.vocab_size   s    4<   rK   c                 0    t          | j        fi | j        S rw   )rh   rd   added_tokens_encoderrx   s    rI   	get_vocabzCTRLTokenizer.get_vocab   s    DL>>D$=>>>rK   c                     | j         v r j         |         S t          |          }t          t          |d d                   |d         dz   gz             }t          |          }|s|S 	 t	          | fd          }| j        vrn8|\  }}g }d}|t          |          k     r	 |                    ||          }	|                    |||	                    |	}n-# t          $ r  |                    ||d                     Y nw xY w||         |k    rC|t          |          dz
  k     r-||dz            |k    r|
                    ||z              |dz  }n |
                    ||                    |dz  }|t          |          k     t          |          }|}t          |          dk    rnt          |          }Wd	                    |          }|d d
         }| j         |<   |S )NrZ   z</w>Tc                 T    j                             | t          d                    S )Ninf)rl   getfloat)pairrp   s    rI   <lambda>z#CTRLTokenizer.bpe.<locals>.<lambda>   s     1C1CD%PU,,1W1W rK   keyr   rB      @@ )rm   r\   listrJ   minrl   rk   indexextend
ValueErrorappendjoin)
rp   tokenrE   rF   bigramfirstsecondnew_wordijs
   `         rI   bpezCTRLTokenizer.bpe   s#   DJ:e$$U||T$ss)__R6(9'::;;$ 	L	($W$W$W$WXXXFT^++"ME6HAc$ii--

5!,,A
 OOD1I...AA "   OODH---E 7e##CIIM(9(9d1q5kV>S>SOOEFN333FAAOODG,,,FA c$ii--  XHD4yyA~~!$9	(: zz$CRCy 
5s   (C 'DDc                     g }t          j        d|          }|D ]J}|                    t          |                     |                              d                               K|S )zTokenize a string.z\S+\n? )refindallr   r   r   r]   )rp   textsplit_tokenswordsr   s        rI   	_tokenizezCTRLTokenizer._tokenize   sf    
9d++ 	B 	BETXXe__%:%:3%?%? @ @AAAArK   c                 r    | j                             || j                             | j                            S )z0Converts a token (str) in an id using the vocab.)rd   r   r`   )rp   r   s     rI   _convert_token_to_idz"CTRLTokenizer._convert_token_to_id   s,    |t|'7'7'G'GHHHrK   c                 B    | j                             || j                  S )z=Converts an index (integer) in a token (str) using the vocab.)rf   r   r`   )rp   r   s     rI   _convert_id_to_tokenz"CTRLTokenizer._convert_id_to_token   s    |t~666rK   c                 |    d                     |                              dd                                          }|S )z:Converts a sequence of tokens (string) in a single string.r   r    )r   replacestrip)rp   tokens
out_strings      rI   convert_tokens_to_stringz&CTRLTokenizer.convert_tokens_to_string   s5    XXf%%--eR88>>@@
rK   Nsave_directoryfilename_prefixreturnc           	         t           j                            |          s t                              d| d           d S t           j                            ||r|dz   ndt          d         z             }t           j                            ||r|dz   ndt          d         z             }t          |dd	          5 }|                    t          j
        | j        d
dd          dz              d d d            n# 1 swxY w Y   d}t          |dd	          5 }|                    d           t          | j                                        d           D ][\  }}	||	k    r t                              d| d           |	}|                    d                    |          dz              |dz  }\	 d d d            n# 1 swxY w Y   ||fS )NzVocabulary path (z) should be a directory-r   r   r	   wrP   rQ   r   TF)indent	sort_keysensure_asciirY   r   z#version: 0.2
c                     | d         S )NrB   rT   )kvs    rI   r   z/CTRLTokenizer.save_vocabulary.<locals>.<lambda>   s    Y[\]Y^ rK   r   zSaving vocabulary to zZ: BPE merge indices are not consecutive. Please check that the tokenizer is not corrupted!r   rB   )ospathisdirloggererrorr   VOCAB_FILES_NAMESra   writerb   dumpsrd   sortedrl   re   warning)
rp   r   r   r   
merge_filefr   writer
bpe_tokenstoken_indexs
             rI   save_vocabularyzCTRLTokenizer.save_vocabulary   s   w}}^,, 	LLT^TTTUUUFW\\oM_s222QbcoQpp
 

 W\\oM_s222QbcpQqq
 

 *cG444 	cGGDJt|ATYZZZ]aabbb	c 	c 	c 	c 	c 	c 	c 	c 	c 	c 	c 	c 	c 	c 	c *cG444 
	LL*++++1$.2F2F2H2HN^N^+_+_+_  '
KK''NNM
 M M M   (ESXXj11D8999

	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 :%%s%   <4C<<D D BGG
G)rN   rw   )__name__
__module____qualname____doc__r   vocab_files_namesCONTROL_CODEScontrol_codesro   propertyry   r|   r   r   r   r   r   strr   r   r   __classcell__)ru   s   @rI   rM   rM   n   s          *!M	8 	8 	8 	8 	8 	8 ! ! X!? ? ?* * *X  I I I7 7 7  
& &c &HSM &]bcf]g & & & & & & & &rK   rM   )r   rb   r   typingr   r   regexr   tokenization_utilsr   utilsr   
get_loggerr   r   r   r   rJ   rM   rT   rK   rI   <module>r      s   0 /  				 " " " " " " " "     5 5 5 5 5 5       
	H	%	%   88D8 v8 u	8
 e8 
58 
58 F8 8 8 e8 8 u8 v8 u8  !8" u#8 8$ U%8& U'8( e)8* T+8, T-8. U/80 E182 U384 d586 
5788 e98: e;8< u=8> t?8@ eA8B %C8D uE8 8 8F G8H VI8J uK8L EM8N uO8P UQ8R uS8T fU8V W8X TY8Z u[8\ 6]8^ %_8` Ua8b c8d Ee8f Vg8 8h o8 8 8v   D& D& D& D& D&' D& D& D& D& D&rK   