    g"                         d Z ddlZddlmZ ddlmZmZmZmZm	Z	 ddl
ZddlmZ ddlmZ dd	lmZ  ej        e          Zd
diZ G d de          ZdS )z Tokenization class for SpeechT5.    N)copyfile)AnyDictListOptionalTuple   )PreTrainedTokenizer)logging   )EnglishNumberNormalizer
vocab_filezspm_char.modelc            
           e Zd ZdZeZddgZ	 	 	 	 	 	 d!d
eee	e
f                  dd	f fdZd"dZed             Zed             Zej        d             Zd Zd Zd Zde	dee	         fdZd Zd Zd Zd#dee         fdZ	 d$dee         deee                  dedee         f fdZd#de	dee	         dee	         fd Z xZS )%SpeechT5Tokenizera	  
    Construct a SpeechT5 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning-of-sequence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end-of-sequence token.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        normalize (`bool`, *optional*, defaults to `False`):
            Whether to convert numeric quantities in the text to their spelled-out English counterparts (see the
            example below).
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the `nbest_size` best results.
              - `nbest_size < 0`: assumes that `nbest_size` is infinite and samples from all hypotheses (lattice)
                using the forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

    Attributes:
        sp_model (`SentencePieceProcessor`):
            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
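
    Example (a minimal usage sketch; it assumes the `"microsoft/speecht5_tts"` checkpoint is available):

    ```python
    >>> from transformers import SpeechT5Tokenizer

    >>> tokenizer = SpeechT5Tokenizer.from_pretrained("microsoft/speecht5_tts")
    >>> ids = tokenizer("Hello, how are you?").input_ids

    >>> # `normalize=True` spells out numbers through the EnglishNumberNormalizer
    >>> text, _ = tokenizer.prepare_for_tokenization("I owe you 12 dollars", normalize=True)
    ```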
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        normalize=False,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> None:
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        self.vocab_file = vocab_file
        self.normalize = normalize
        self._normalizer = None

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            normalize=normalize,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )

    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
        normalize = kwargs.pop("normalize", self.normalize)
        if is_split_into_words:
            text = " " + text
        if normalize:
            text = self.normalizer(text)
        return (text, kwargs)

    @property
    def vocab_size(self):
        return self.sp_model.get_piece_size()

    @property
    def normalizer(self):
        if self._normalizer is None:
            self._normalizer = EnglishNumberNormalizer()
        return self._normalizer

    @normalizer.setter
    def normalizer(self, value):
        self._normalizer = value

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        # for backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)

    def _tokenize(self, text: str) -> List[str]:
        """Take as input a string and return a list of strings (tokens) for words/sub-words."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for token in tokens:
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self.all_special_tokens:
                if not prev_is_special:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string.strip()

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
        """Build model inputs from a sequence by appending eos_token_id."""
        if token_ids_1 is None:
            return token_ids_0 + [self.eos_token_id]
        # We don't expect to process pairs, but leave the pair logic for API consistency
        return token_ids_0 + token_ids_1 + [self.eos_token_id]

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        suffix_ones = [1]
        if token_ids_1 is None:
            return ([0] * len(token_ids_0)) + suffix_ones
        return ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)