"""Tokenization class for VITS."""

import json
import os
import re
from typing import Any, Dict, List, Optional, Tuple, Union

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import is_phonemizer_available, is_uroman_available, logging


if is_phonemizer_available():
    import phonemizer

if is_uroman_available():
    import uroman as ur

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.json"}


def has_non_roman_characters(input_string):
    # Find any character outside the ASCII range
    non_roman_pattern = re.compile(r"[^\x00-\x7F]")

    # Search the input string for non-Roman characters
    match = non_roman_pattern.search(input_string)
    has_non_roman = match is not None

    return has_non_roman


class VitsTokenizer(PreTrainedTokenizer):
    """
    Construct a VITS tokenizer. Also supports MMS-TTS.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        language (`str`, *optional*):
            Language identifier.
        add_blank (`bool`, *optional*, defaults to `True`):
            Whether to insert token id 0 in between the other tokens.
        normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the input text by removing all casing and punctuation.
        phonemize (`bool`, *optional*, defaults to `True`):
            Whether to convert the input text into phonemes.
        is_uroman (`bool`, *optional*, defaults to `False`):
            Whether the `uroman` Romanizer needs to be applied to the input text prior to tokenizing.
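
    Example (an illustrative sketch, not an exhaustive reference; it assumes the public
    `facebook/mms-tts-eng` MMS-TTS checkpoint is available from the Hugging Face Hub):

    ```python
    >>> from transformers import VitsTokenizer

    >>> tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
    >>> inputs = tokenizer("Hello world", return_tensors="pt")
    >>> # with `add_blank=True`, token id 0 is interspersed between the character ids
    >>> inputs["input_ids"]  # doctest: +SKIP
    ```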
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        pad_token="<pad>",
        unk_token="<unk>",
        language=None,
        add_blank=True,
        normalize=True,
        phonemize=True,
        is_uroman=False,
        **kwargs,
    ) -> None:
        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)

        self.decoder = {v: k for k, v in self.encoder.items()}
        self.language = language
        self.add_blank = add_blank
        self.normalize = normalize
        self.phonemize = phonemize
        self.is_uroman = is_uroman

        super().__init__(
            pad_token=pad_token,
            unk_token=unk_token,
            language=language,
            add_blank=add_blank,
            normalize=normalize,
            phonemize=phonemize,
            is_uroman=is_uroman,
            **kwargs,
        )

    @property
    def vocab_size(self):
        return len(self.encoder)

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def normalize_text(self, input_string):
        """Lowercase the input string, respecting any special token ids that may be part or entirely upper-cased."""
        all_vocabulary = list(self.encoder.keys()) + list(self.added_tokens_encoder.keys())
        filtered_text = ""

        i = 0
        while i < len(input_string):
            found_match = False
            for word in all_vocabulary:
                if input_string[i : i + len(word)] == word:
                    filtered_text += word
                    i += len(word)
                    found_match = True
                    break

            if not found_match:
                filtered_text += input_string[i].lower()
                i += 1

        return filtered_text

    def _preprocess_char(self, text):
        """Special treatment of characters in certain languages"""
        if self.language == "ron":
            text = text.replace("ț", "ţ")
        return text

    def prepare_for_tokenization(
        self, text: str, is_split_into_words: bool = False, normalize: Optional[bool] = None, **kwargs
    ) -> Tuple[str, Dict[str, Any]]:
        """
        Performs any necessary transformations before tokenization.

        This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
        `kwargs` at the end of the encoding process to be sure all the arguments have been used.

        Args:
            text (`str`):
                The text to prepare.
            is_split_into_words (`bool`, *optional*, defaults to `False`):
                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
                which it will tokenize.
            normalize (`bool`, *optional*, defaults to `None`):
                Whether or not to apply punctuation and casing normalization to the text inputs. Typically, VITS is
                trained on lower-cased and un-punctuated text. Hence, normalization is used to ensure that the input
                text consists only of lower-case characters.
            kwargs (`Dict[str, Any]`, *optional*):
                Keyword arguments to use for the tokenization.

        Returns:
            `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
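
        Example (illustrative sketch; the exact output depends on the configured normalization
        and the installed `espeak` phonemizer backend, so no output is asserted here):

        ```python
        >>> # lower-cases the input and, when `phonemize=True`, converts it to IPA phonemes
        >>> prepared_text, unused_kwargs = tokenizer.prepare_for_tokenization("Hello world!")  # doctest: +SKIP
        ```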
        NaC  Text to the tokenizer contains non-Roman characters. To apply the `uroman` pre-processing step automatically, ensure the `uroman` Romanizer is installed with: `pip install uroman` Note `uroman` requires python version >= 3.10Otherwise, apply the Romanizer manually as per the instructions: https://github.com/isi-nlp/uromanzEPlease install the `phonemizer` Python package to use this tokenizer.zen-usespeakT)r-   backendstrippreserve_punctuationwith_stressz\s+ rL   c                     | j         v S r?   )r5   )charr:   s    r   <lambda>z8VitsTokenizer.prepare_for_tokenization.<locals>.<lambda>   s    TT\=Q r   )r/   rU   rZ   r   r1   r   loggerwarningurUromanromanize_stringr0   r   ImportError
phonemizerr   subjoinrN   filterr_   )r:   rY   r[   r/   r;   rR   uromans   `      r   prepare_for_tokenizationz&VitsTokenizer.prepare_for_tokenization   se   4 "+!6IIDN	 	-&&t,,D--d33#M22 
	Ft~ 
	F&(( 	Fy     & 6 6} E E> 	l*,, k!"ijjj&0  %)   M F63>>MM 	lGGD0Q0Q0Q0QS`)a)a$b$bcciikkMf$$r   c                     t          |          }| j        r6|                     d          gt          |          dz  dz   z  }||ddd<   |}|S )z]Tokenize a string by inserting the `<pad>` token at the boundary between adjacent characters.r      rM   N)rN   r.   _convert_id_to_tokenr@   )r:   rY   tokensintersperseds       r   	_tokenizezVitsTokenizer._tokenize   s[    d> 	" 55a889S[[1_q=PQL!'LA!Fr   ru   c                 v    | j         rt          |          dk    r|dd d         }d                    |          S )NrM   rs   rL   )r.   r@   rn   )r:   ru   s     r   convert_tokens_to_stringz&VitsTokenizer.convert_tokens_to_string   s9    > 	"c&kkAooADqD\Fwwvr   c                 r    | j                             || j                             | j                            S )z0Converts a token (str) in an id using the vocab.)r5   getr,   )r:   tokens     r   _convert_token_to_idz"VitsTokenizer._convert_token_to_id   s,    |t|'7'7'G'GHHHr   c                 6    | j                             |          S )z=Converts an index (integer) in a token (str) using the vocab.)r7   r{   )r:   indexs     r   rt   z"VitsTokenizer._convert_id_to_token   s    |&&&r   save_directoryfilename_prefixc           	         t           j                            |          s t                              d| d           d S t           j                            ||r|dz   ndt          d         z             }t          |dd          5 }|                    t          j
        | j        d	d
d          dz              d d d            n# 1 swxY w Y   |fS )NzVocabulary path (z) should be a directory-rL   r   wr"   r#   rs   TF)indent	sort_keysensure_ascii
)ospathisdirrf   errorrn   VOCAB_FILES_NAMESr2   writer3   dumpsr5   )r:   r   r   r   fs        r   save_vocabularyzVitsTokenizer.save_vocabulary   s!   w}}^,, 	LLT^TTTUUUFW\\oM_s222QbcoQpp
 

 *cG444 	cGGDJt|ATYZZZ]aabbb	c 	c 	c 	c 	c 	c 	c 	c 	c 	c 	c 	c 	c 	c 	c }s   4CCC)r   r   NTTTF)r    N)FNr?   )__name__
__module____qualname____doc__r   vocab_files_namesmodel_input_namesr9   propertyrA   rJ   rU   rZ   strboolr   r   r   r   rq   r   rw   ry   r}   rt   r   r   __classcell__)r=   s   @r   r   r   /   s        * *$&67
  
 
 
  
  
  
  
  
D ! ! X!  
  *   Y]?% ?%?%.2?%GOPT~?%	sDcN"	#?% ?% ?% ?%B	c 	d3i 	 	 	 	tCy S    
I I I' ' ' c HSM ]bchilcmoscs]t        r   r   )r   r3   r   r   typingr   r   r   r   r   r   tokenization_utilsr
   utilsr   r   r   rl   rp   rh   
get_loggerr   rf   r   r   r   r&   r   r   <module>r      s/   # "  				 				 : : : : : : : : : : : : : : : : 5 5 5 5 5 5 J J J J J J J J J J   		H	%	%!<0   D D D D D' D D D D Dr   