
"""Tokenization class for SigLIP model."""

import os
import re
import string
import warnings
from shutil import copyfile
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple

import sentencepiece as spm

from ...convert_slow_tokenizer import import_protobuf
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import AddedToken


if TYPE_CHECKING:
    from ...tokenization_utils_base import TextInput

from ...utils import logging, requires_backends


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}

SPIECE_UNDERLINE = "▁"


class SiglipTokenizer(PreTrainedTokenizer):
    """
    Construct a Siglip tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"</s>"`):
            The token used for padding, for example when batching sequences of different lengths.
        additional_special_tokens (`List[str]`, *optional*):
            Additional special tokens used by the tokenizer.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
        model_max_length (`int`, *optional*, defaults to 64):
            The maximum length (in number of tokens) for model inputs.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="</s>",
        additional_special_tokens=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        model_max_length=64,
        do_lower_case=True,
        **kwargs,
    ) -> None:
        requires_backends(self, "protobuf")

        # Wrap the special tokens as AddedToken so surrounding whitespace is stripped and they
        # are never split by the SentencePiece model.
        eos_token = (
            AddedToken(eos_token, rstrip=True, lstrip=True, normalized=False, special=True)
            if isinstance(eos_token, str)
            else eos_token
        )
        unk_token = (
            AddedToken(unk_token, rstrip=True, lstrip=True, normalized=False, special=True)
            if isinstance(unk_token, str)
            else unk_token
        )
        pad_token = (
            AddedToken(pad_token, rstrip=True, lstrip=True, normalized=False, special=True)
            if isinstance(pad_token, str)
            else pad_token
        )

        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        self.do_lower_case = do_lower_case
        self.vocab_file = vocab_file
        self.sp_model = self.get_spm_processor()

        super().__init__(
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            additional_special_tokens=additional_special_tokens,
            sp_model_kwargs=self.sp_model_kwargs,
            model_max_length=model_max_length,
            do_lower_case=do_lower_case,
            **kwargs,
        )

    def get_spm_processor(self):
        # Load the serialized SentencePiece model, disable `add_dummy_prefix` in its normalizer
        # spec, and hand the patched proto to the processor.
        tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)

        with open(self.vocab_file, "rb") as f:
            sp_model = f.read()
            model_pb2 = import_protobuf()
            model = model_pb2.ModelProto.FromString(sp_model)
            normalizer_spec = model_pb2.NormalizerSpec()
            normalizer_spec.add_dummy_prefix = False
            model.normalizer_spec.MergeFrom(normalizer_spec)
            sp_model = model.SerializeToString()
            tokenizer.LoadFromSerializedProto(sp_model)
        return tokenizer

    @property
    def vocab_size(self):
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is None:
            return ([0] * len(token_ids_0)) + [1]
        return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]

    def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]:
        """Do not add eos again if user already added it."""
        if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
            warnings.warn(
                f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated"
                " eos tokens being added."
            )
            return token_ids
        else:
            return token_ids + [self.eos_token_id]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
        use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        eos = [self.eos_token_id]

        if token_ids_1 is None:
            return len(token_ids_0 + eos) * [0]
        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        token_ids_0 = self._add_eos_if_not_present(token_ids_0)
        if token_ids_1 is None:
            return token_ids_0
        else:
            token_ids_1 = self._add_eos_if_not_present(token_ids_1)
            return token_ids_0 + token_ids_1

    def __getstate__(self):
        # The SentencePiece processor is not picklable; drop it and rebuild it in __setstate__.
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        # for backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)

    def remove_punctuation(self, text: str) -> str:
        return text.translate(str.maketrans("", "", string.punctuation))

    def canonicalize_text(self, text, *, keep_punctuation_exact_string=None):
        """Returns canonicalized `text` (punctuation removed).

        Args:
            text (`str`):
                String to be canonicalized.
            keep_punctuation_exact_string (`str`, *optional*):
                If provided, then this exact string is kept. For example providing '{}' will keep any occurrences of '{}'
                (but will still remove '{' and '}' that appear separately).
        """
        if keep_punctuation_exact_string:
            text = keep_punctuation_exact_string.join(
                self.remove_punctuation(part) for part in text.split(keep_punctuation_exact_string)
            )
        else:
            text = self.remove_punctuation(text)
        text = re.sub(r"\s+", " ", text)
        text = text.strip()

        return text

    def tokenize(self, text: "TextInput", add_special_tokens=False, **kwargs) -> List[str]:
        """
        Converts a string to a list of tokens.
        """
        tokens = super().tokenize(SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " "), **kwargs)

        if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
            tokens = tokens[1:]
        return tokens

    @property
    def unk_token_length(self):
        return len(self.sp_model.encode(str(self.unk_token)))

    def _tokenize(self, text, **kwargs):
        """
        Returns a tokenized string.

        We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
        SPIECE_UNDERLINE.

        For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give `['H', 'e', 'y']` instead of `['▁He', 'y']`.

        Thus we always encode `f"{unk_token}text"` and strip the `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
        `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
        """
        text = self.canonicalize_text(text, keep_punctuation_exact_string=None)
        tokens = self.sp_model.encode(text, out_type=str)

        # 1. Encode the string with the unk_token prefix, e.g. "<unk> Hey"
        tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
        # 2. Strip the unk_token pieces, e.g. ['<', 'unk', '>'] from ['<', 'unk', '>', '▁Hey']
        return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for token in tokens:
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self.all_special_tokens:
                if not prev_is_special:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string.strip()

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)
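

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only, not part of the library API): the checkpoint
    # name below is an assumption; any SigLIP checkpoint that ships a compatible
    # `spiece.model` can be substituted.
    tokenizer = SiglipTokenizer.from_pretrained("google/siglip-base-patch16-224")

    # `canonicalize_text` strips punctuation before SentencePiece runs and `</s>` is appended
    # to every sequence, so the trailing period does not change the encoding.
    print(tokenizer("a photo of a cat.")["input_ids"])
    print(tokenizer("a photo of a cat")["input_ids"])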