
"""Tokenization classes for CLIP."""

import json
import os
import unicodedata
from functools import lru_cache
from typing import List, Optional, Tuple

import regex as re

from ...tokenization_utils import AddedToken, PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}


@lru_cache()
def bytes_to_unicode():
    """
    Returns a list of utf-8 bytes and a mapping to unicode strings. We specifically avoid mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
    tables between utf-8 bytes and unicode strings.
    !~      ¡   ¬   ®   ÿNr      c                 ,    g | ]}t          |          S  )chr).0ns     f/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/clip/tokenization_clip.py
<listcomp>z$bytes_to_unicode.<locals>.<listcomp>:   s    			Q#a&&			    )listrangeordappenddictzip)bscsr   bs       r   bytes_to_unicoder*   %   s    	U3s88SXX\**++d5TCIIPQM3R3R.S.SSVZ[`adeiajajloptluluxyly[z[zV{V{{  
AAAB	A4[[  B;;IIaLLLIIdQhFA		"			BBr    c                 ~    t                      }| d         }| dd         D ]}|                    ||f           |}|S )z
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    r   r   N)setadd)wordpairs	prev_charchars       r   	get_pairsr2   >   sP     EEEQIQRR  		9d#$$$		Lr    c                 Z    t          j        dd|           } |                                 } | S )Nz\s+ )resubstrip)texts    r   whitespace_cleanr9   L   s'    6&#t$$D::<<DKr    c                 ^    |                                  } | sg S |                                 }|S )z@Runs basic whitespace cleaning and splitting on a piece of text.)r7   split)r8   tokenss     r   whitespace_tokenizer=   S   s.    ::<<D 	ZZ\\FMr    c                   L    e Zd ZdZ	 	 	 	 	 ddZddZd ZddZd Zd	 Z	d
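

# Illustrative sketch (not part of the original module): the table from
# `bytes_to_unicode` is a bijection over all 256 byte values, so byte-level
# encoding is losslessly reversible, and `get_pairs` enumerates the adjacent
# symbol pairs that BPE considers for merging:
#
#     b2u = bytes_to_unicode()
#     u2b = {u: b for b, u in b2u.items()}
#     assert all(u2b[b2u[b]] == b for b in range(256))
#     assert get_pairs(("l", "o", "w", "e", "r</w>")) == {
#         ("l", "o"), ("o", "w"), ("w", "e"), ("e", "r</w>")
#     }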


class BasicTokenizer:
    """
    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).

    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
        do_split_on_punc (`bool`, *optional*, defaults to `True`):
            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
            the full context of the words, such as contractions.
    """

    def __init__(
        self,
        do_lower_case=True,
        never_split=None,
        tokenize_chinese_chars=True,
        strip_accents=None,
        do_split_on_punc=True,
    ):
        if never_split is None:
            never_split = []
        self.do_lower_case = do_lower_case
        self.never_split = set(never_split)
        self.tokenize_chinese_chars = tokenize_chinese_chars
        self.strip_accents = strip_accents
        self.do_split_on_punc = do_split_on_punc

    def tokenize(self, text, never_split=None):
        """
        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.

        Args:
            never_split (`List[str]`, *optional*)
                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
                [`PreTrainedTokenizer.tokenize`]). List of tokens not to split.
        """
        # union() returns a new set by concatenating the two sets.
        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
        text = self._clean_text(text)

        if self.tokenize_chinese_chars:
            text = self._tokenize_chinese_chars(text)
        # prevents treating the same character with different unicode codepoints as different characters
        unicode_normalized_text = unicodedata.normalize("NFC", text)
        orig_tokens = whitespace_tokenize(unicode_normalized_text)
        split_tokens = []
        for token in orig_tokens:
            if token not in never_split:
                if self.do_lower_case:
                    token = token.lower()
                    if self.strip_accents is not False:
                        token = self._run_strip_accents(token)
                elif self.strip_accents:
                    token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token, never_split))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text."""
        if not self.do_split_on_punc or (never_split is not None and text in never_split):
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        if (
            (cp >= 0x4E00 and cp <= 0x9FFF)
            or (cp >= 0x3400 and cp <= 0x4DBF)
            or (cp >= 0x20000 and cp <= 0x2A6DF)
            or (cp >= 0x2A700 and cp <= 0x2B73F)
            or (cp >= 0x2B740 and cp <= 0x2B81F)
            or (cp >= 0x2B820 and cp <= 0x2CEAF)
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)
        ):
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xFFFD or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)
           e Zd ZdZeZddgZ	 	 	 	 	 d fd	Zed             Z	d	 Z
	 ddee         deee                  dee         fdZ	 ddee         deee                  dedee         f fdZ	 ddee         deee                  dee         fdZd Zd Zd Zd Zd Zddedee         dee         fdZ xZS )CLIPTokenizera  
    Construct a CLIP tokenizer. Based on byte-level Byte-Pair-Encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8. See
            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding, for example when batching sequences of different lengths.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        unk_token="<|endoftext|>",
        bos_token="<|startoftext|>",
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",  # hack to enable padding
        **kwargs,
    ):
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token

        try:
            import ftfy

            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.info("ftfy or spacy is not installed using custom BasicTokenizer instead of ftfy.")
            self.nlp = BasicTokenizer(strip_accents=False, do_split_on_punc=False)
            self.fix_text = None

        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        with open(merges_file, encoding="utf-8") as merges_handle:
            bpe_merges = merges_handle.read().strip().split("\n")[1 : 49152 - 256 - 2 + 1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {"<|startoftext|>": "<|startoftext|>", "<|endoftext|>": "<|endoftext|>"}

        self.pat = re.compile(
            r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
            re.IGNORECASE,
        )

        super().__init__(
            errors=errors,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            **kwargs,
        )
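
    # A minimal sketch (not from the original source) of what `self.pat` splits
    # out: special tokens, common English contractions, letter runs, single
    # digits, and punctuation runs each become separate pieces, while
    # whitespace is simply skipped:
    #
    #     pat = re.compile(
    #         r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
    #         re.IGNORECASE,
    #     )
    #     re.findall(pat, "A photo of 2 cats!")  # -> ["A", "photo", "of", "2", "cats", "!"]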

    @property
    def vocab_size(self):
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. A CLIP sequence has the following format:

        - single sequence: `<|startoftext|> X <|endoftext|>`

        Pairs of sequences are not the expected use case, but they will be handled without a separator.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        )bos_token_ideos_token_idrG   r   r   r   r   s        r    build_inputs_with_special_tokensz.CLIPTokenizer.build_inputs_with_special_tokensS  sO    ( &'	&'	{*Y66;&2Y>LyXXr    Falready_has_special_tokensc                     |r$t                                          ||d          S |dgdgt          |          z  z   dgz   S dgdgt          |          z  z   dgz   dgz   dgt          |          z  z   dgz   S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)r   r   r   Nr   r   )r   get_special_tokens_maskrf   )rG   r   r   r   r   s       r   r   z%CLIPTokenizer.get_special_tokens_maskn  s    & & 	7722'[]a 3    31#K 0 001QC77sqcC,,,-3qc9aS3{CSCS=STXYWZZZr    c                     | j         g}| j        g}|t          ||z   |z             dgz  S t          ||z   |z   |z   |z   |z             dgz  S )a  
        Create a mask from the two sequences passed. CLIP does not make use of token type ids, therefore a list of
        zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        bos_token = [self.bos_token_id]
        eos_token = [self.eos_token_id]

        if token_ids_1 is None:
            return len(bos_token + token_ids_0 + eos_token) * [0]
        return len(bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token) * [0]

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token[:-1]) + (token[-1] + "</w>",)
        pairs = get_pairs(word)

        if not pairs:
            return token + "</w>"

        while True:
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        self.cache[token] = word
        return word

    def _tokenize(self, text):
        """Tokenize a string."""
        bpe_tokens = []
        if self.fix_text is None:
            text = " ".join(self.nlp.tokenize(text))
        else:
            text = whitespace_clean(self.fix_text(text)).lower()

        for token in re.findall(self.pat, text):
            # map every byte to its unicode stand-in so BPE never sees raw whitespace/control bytes
            token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
        return bpe_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        text = "".join(tokens)
        byte_array = bytearray([self.byte_decoder[c] for c in text])
        text = byte_array.decode("utf-8", errors=self.errors).replace("</w>", " ").strip()
        return text

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        with open(merge_file, "w", encoding="utf-8") as writer:
            writer.write("#version: 0.2\n")
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        "Saving vocabulary to {}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!".format(merge_file)
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return vocab_file, merge_file
 /
 /
 /
 /
 /
b ! ! X!? ? ? JNY Y9Y3;DI3FY	cY Y Y Y8 sx[ [9[3;DI3F[ko[	c[ [ [ [ [ [: JNd d9d3;DI3Fd	cd d d d.( ( (T  I I I' ' '  & &c &HSM &]bcf]g & & & & & & & &r    ru   )rs   r   r   rN   	functoolsr   typingr   r   r   regexr5   tokenization_utilsr   r	   r