"""Tokenization classes for Whisper."""
import json
import os
import warnings
from functools import lru_cache
from typing import List, Optional, Tuple, Union

import numpy as np
import regex as re

from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging
from .english_normalizer import BasicTextNormalizer, EnglishTextNormalizer


VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "tokenizer_file": "tokenizer.json",
    "merges_file": "merges.txt",
    "normalizer_file": "normalizer.json",
}

MAX_MODEL_INPUT_SIZES = {
    "openai/whisper-base": 448,
}


@lru_cache()
def bytes_to_unicode():
    """
    Returns a list of utf-8 bytes and a mapping to unicode strings. We specifically avoid mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
    tables between utf-8 bytes and unicode strings.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))


logger = logging.get_logger(__name__)


def get_pairs(word):
    """
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


LANGUAGES = {
    "en": "english",
    "zh": "chinese",
    "de": "german",
    "es": "spanish",
    "ru": "russian",
    "ko": "korean",
    "fr": "french",
    "ja": "japanese",
    "pt": "portuguese",
    "tr": "turkish",
    "pl": "polish",
    "ca": "catalan",
    "nl": "dutch",
    "ar": "arabic",
    "sv": "swedish",
    "it": "italian",
    "id": "indonesian",
    "hi": "hindi",
    "fi": "finnish",
    "vi": "vietnamese",
    "he": "hebrew",
    "uk": "ukrainian",
    "el": "greek",
    "ms": "malay",
    "cs": "czech",
    "ro": "romanian",
    "da": "danish",
    "hu": "hungarian",
    "ta": "tamil",
    "no": "norwegian",
    "th": "thai",
    "ur": "urdu",
    "hr": "croatian",
    "bg": "bulgarian",
    "lt": "lithuanian",
    "la": "latin",
    "mi": "maori",
    "ml": "malayalam",
    "cy": "welsh",
    "sk": "slovak",
    "te": "telugu",
    "fa": "persian",
    "lv": "latvian",
    "bn": "bengali",
    "sr": "serbian",
    "az": "azerbaijani",
    "sl": "slovenian",
    "kn": "kannada",
    "et": "estonian",
    "mk": "macedonian",
    "br": "breton",
    "eu": "basque",
    "is": "icelandic",
    "hy": "armenian",
    "ne": "nepali",
    "mn": "mongolian",
    "bs": "bosnian",
    "kk": "kazakh",
    "sq": "albanian",
    "sw": "swahili",
    "gl": "galician",
    "mr": "marathi",
    "pa": "punjabi",
    "si": "sinhala",
    "km": "khmer",
    "sn": "shona",
    "yo": "yoruba",
    "so": "somali",
    "af": "afrikaans",
    "oc": "occitan",
    "ka": "georgian",
    "be": "belarusian",
    "tg": "tajik",
    "sd": "sindhi",
    "gu": "gujarati",
    "am": "amharic",
    "yi": "yiddish",
    "lo": "lao",
    "uz": "uzbek",
    "fo": "faroese",
    "ht": "haitian creole",
    "ps": "pashto",
    "tk": "turkmen",
    "nn": "nynorsk",
    "mt": "maltese",
    "sa": "sanskrit",
    "lb": "luxembourgish",
    "my": "myanmar",
    "bo": "tibetan",
    "tl": "tagalog",
    "mg": "malagasy",
    "as": "assamese",
    "tt": "tatar",
    "haw": "hawaiian",
    "ln": "lingala",
    "ha": "hausa",
    "ba": "bashkir",
    "jw": "javanese",
    "su": "sundanese",
    "yue": "cantonese",
}

# language code lookup by name, with a few language aliases
TO_LANGUAGE_CODE = {
    **{language: code for code, language in LANGUAGES.items()},
    "burmese": "my",
    "valencian": "ca",
    "flemish": "nl",
    "haitian": "ht",
    "letzeburgesch": "lb",
    "pushto": "ps",
    "panjabi": "pa",
    "moldavian": "ro",
    "moldovan": "ro",
    "sinhalese": "si",
    "castilian": "es",
    "mandarin": "zh",
}

TASK_IDS = ["translate", "transcribe"]


class WhisperTokenizer(PreTrainedTokenizer):
    """
    Construct a Whisper tokenizer.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains some of the main methods. Users should refer to
    the superclass for more information regarding such methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        normalizer_file (`str`, *optional*):
            Path to the normalizer file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8. See
            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The beginning of sequence token. The `decoder_start_token_id` is used to set the first token as
            `"<|startoftranscript|>"` when generating.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*):
            The token used for padding, for example when batching sequences of different lengths.
        add_prefix_space (`bool`, *optional*, defaults to `False`):
            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
            other word.
        language (`str`, *optional*):
            The language of the transcription text. The corresponding language id token is appended to the start of the
            sequence for multilingual speech recognition and speech translation tasks, e.g. for Spanish the token
            `"<|es|>"` is appended to the start of sequence. This should be used for multilingual fine-tuning only.
        task (`str`, *optional*):
            Task identifier to append at the start of sequence (if any). This should be used for multilingual
            fine-tuning, with `"transcribe"` for speech recognition and `"translate"` for speech translation.
        predict_timestamps (`bool`, *optional*, defaults to `False`):
            Whether to omit the `<|notimestamps|>` token at the start of the sequence.
    """
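
    # Usage sketch (illustrative; requires downloading a checkpoint such as
    # "openai/whisper-tiny"):
    #
    #     tokenizer = WhisperTokenizer.from_pretrained(
    #         "openai/whisper-tiny", language="spanish", task="transcribe"
    #     )
    #     out = tokenizer.decode(generated_ids, skip_special_tokens=True, output_offsets=True)
    #     out["text"]     # the plain transcription
    #     out["offsets"]  # [{"text": ..., "timestamp": (start_seconds, end_seconds)}, ...]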
    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        merges_file,
        normalizer_file=None,
        errors="replace",
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        pad_token=None,
        add_prefix_space=False,
        language=None,
        task=None,
        predict_timestamps=False,
        **kwargs,
    ):
        bos_token = (
            AddedToken(bos_token, lstrip=False, rstrip=False, normalized=False, special=True)
            if isinstance(bos_token, str)
            else bos_token
        )
        eos_token = (
            AddedToken(eos_token, lstrip=False, rstrip=False, normalized=False, special=True)
            if isinstance(eos_token, str)
            else eos_token
        )
        unk_token = (
            AddedToken(unk_token, lstrip=False, rstrip=False, normalized=False, special=True)
            if isinstance(unk_token, str)
            else unk_token
        )
        pad_token = (
            AddedToken(pad_token, lstrip=False, rstrip=False, normalized=False, special=True)
            if isinstance(pad_token, str)
            else pad_token
        )

        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        with open(merges_file, encoding="utf-8") as merges_handle:
            bpe_merges = merges_handle.read().split("\n")[1:-1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}
        self.add_prefix_space = add_prefix_space

        if normalizer_file is not None:
            with open(normalizer_file, encoding="utf-8") as vocab_handle:
                self.english_spelling_normalizer = json.load(vocab_handle)
        else:
            self.english_spelling_normalizer = None

        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
        self.timestamp_pat = re.compile(r"<\|(\d+\.\d+)\|>")

        self.language = language
        super().__init__(
            errors=errors,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )

        self.task = task
        self.predict_timestamps = predict_timestamps
    @property
    def vocab_size(self) -> int:
        return len(self.encoder)

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        self.cache[token] = word
        return word
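
    # Sketch of how `bpe` resolves a token (merge ranks are checkpoint-specific,
    # so the intermediate merges below are hypothetical):
    #
    #     word = tuple("lower")  # ("l", "o", "w", "e", "r")
    #     get_pairs(word)        # {("l", "o"), ("o", "w"), ("w", "e"), ("e", "r")}
    #
    # The pair with the lowest rank in `bpe_ranks` is merged first, e.g.
    # ("e", "r") -> "er"; the loop repeats until no remaining pair has a recorded
    # merge rank, and the result is cached per input token.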
    def set_prefix_tokens(self, language: str = None, task: str = None, predict_timestamps: bool = None):
        """
        Override the prefix tokens appended to the start of the label sequence. This method can be used standalone to
        update the prefix tokens as required when fine-tuning. Example:

        ```python
        >>> # instantiate the tokenizer and set the prefix token to Spanish
        >>> tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="spanish")
        >>> # now switch the prefix token from Spanish to French
        >>> tokenizer.set_prefix_tokens(language="french")
        ```

        Args:
            language (`str`, *optional*, defaults to `None`):
                The language of the transcription text.
            task (`str`, *optional*, defaults to `None`):
                Task identifier to append at the start of sequence (if any).
            predict_timestamps (`bool`, *optional*, defaults to `None`):
                Whether to omit the `<|notimestamps|>` token at the start of the sequence.
        """
        self.language = language if language is not None else self.language
        self.task = task if task is not None else self.task
        self.predict_timestamps = predict_timestamps if predict_timestamps is not None else self.predict_timestamps

    @property
    def prefix_tokens(self) -> List[int]:
        bos_token_id = self.convert_tokens_to_ids("<|startoftranscript|>")
        translate_token_id = self.convert_tokens_to_ids("<|translate|>")
        transcribe_token_id = self.convert_tokens_to_ids("<|transcribe|>")
        notimestamps_token_id = self.convert_tokens_to_ids("<|notimestamps|>")
        langs = tuple(LANGUAGES.keys())

        if self.language is not None:
            self.language = self.language.lower()
            if self.language in TO_LANGUAGE_CODE:
                language_id = TO_LANGUAGE_CODE[self.language]
            elif self.language in TO_LANGUAGE_CODE.values():
                language_id = self.language
            else:
                is_language_code = len(self.language) == 2
                raise ValueError(
                    f"Unsupported language: {self.language}. Language should be one of:"
                    f" {list(TO_LANGUAGE_CODE.values()) if is_language_code else list(TO_LANGUAGE_CODE.keys())}."
                )

        if self.task is not None:
            if self.task not in TASK_IDS:
                raise ValueError(f"Unsupported task: {self.task}. Task should be in: {TASK_IDS}")

        bos_sequence = [bos_token_id]
        if self.language is not None:
            bos_sequence.append(bos_token_id + 1 + langs.index(language_id))
        if self.task is not None:
            bos_sequence.append(transcribe_token_id if self.task == "transcribe" else translate_token_id)
        if not self.predict_timestamps:
            bos_sequence.append(notimestamps_token_id)
        return bos_sequence

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
        """Build model inputs from a sequence by appending eos_token_id."""
        if token_ids_1 is None:
            return self.prefix_tokens + token_ids_0 + [self.eos_token_id]
        # We don't expect to process pairs, but leave the pair logic for API consistency
        return self.prefix_tokens + token_ids_0 + token_ids_1 + [self.eos_token_id]
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        prefix_ones = [1] * len(self.prefix_tokens)
        suffix_ones = [1]
        if token_ids_1 is None:
            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones

    def _tokenize(self, text):
        """Tokenize a string."""
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            token = "".join(
                self.byte_encoder[b] for b in token.encode("utf-8")
            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
        return bpe_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """
        Converts an index (integer) in a token (str) using the vocab. Whisper's base tokenizer always decodes OOV
        tokens as "", thus we do not use the `unk_token` here.
        """
        return self.decoder.get(index, "")

    def _normalize(self, text):
        warnings.warn(
            "The private method `_normalize` is deprecated and will be removed in v5 of Transformers. "
            "You can normalize an input string using the Whisper English normalizer using the `normalize` method."
        )
        return self.normalize(text)

    def _basic_normalize(self, text, remove_diacritics=False):
        warnings.warn(
            "The private method `_basic_normalize` is deprecated and will be removed in v5 of Transformers. "
            "You can normalize an input string using the Whisper basic normalizer using the `basic_normalize` method."
        )
        return self.basic_normalize(text, remove_diacritics=remove_diacritics)
    def normalize(self, text):
        """
        Normalize a given string using the `EnglishTextNormalizer` class, which performs common transformations on
        English text.
        """
        normalizer = EnglishTextNormalizer(self.english_spelling_normalizer)
        return normalizer(text)

    @staticmethod
    def basic_normalize(text, remove_diacritics=False):
        """
        Normalize a given string using the `BasicTextNormalizer` class, which performs common transformations on
        multilingual text.
        """
        normalizer = BasicTextNormalizer(remove_diacritics=remove_diacritics)
        return normalizer(text)
    def _decode_with_timestamps(self, token_ids, skip_special_tokens=False, time_precision=0.02) -> str:
        """
        Timestamp tokens are above the special tokens' id range and are ignored by `decode()`. This method decodes
        given tokens with timestamps tokens annotated, e.g. "<|1.08|>".
        """
        timestamp_begin = self.all_special_ids[-1] + 1
        outputs = [[]]

        cur_max_timestamp = 0.0
        prev_segments_len = 0.0

        for token in token_ids:
            if token >= timestamp_begin:
                timestamp = float((token - timestamp_begin) * time_precision)

                if timestamp < cur_max_timestamp:
                    # next segment has started
                    prev_segments_len += cur_max_timestamp

                cur_max_timestamp = timestamp

                outputs.append(f"<|{(timestamp + prev_segments_len):.2f}|>")
                outputs.append([])
            else:
                outputs[-1].append(token)
        outputs = [
            s if isinstance(s, str) else self.decode(s, skip_special_tokens=skip_special_tokens) for s in outputs
        ]
        return "".join(outputs)
    def _compute_offsets(self, token_ids, time_precision=0.02):
        """
        Compute offsets for a given tokenized input

        Args:
            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            time_precision (`float`, *optional*, defaults to 0.02):
                The time ratio to convert from token to time.
        """
        offsets = []
        # ensure torch tensor of token ids is placed on cpu
        if "torch" in str(type(token_ids)) and (hasattr(token_ids, "cpu") and callable(token_ids.cpu)):
            token_ids = token_ids.cpu()
        token_ids = np.array(token_ids)
        if token_ids.shape[0] > 1 and len(token_ids.shape) > 1:
            raise ValueError("Can only process a single input at a time")

        timestamp_begin = self.all_special_ids[-1] + 1
        timestamp_tokens = token_ids >= timestamp_begin

        consecutive = np.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0] + 1
        if consecutive.shape[0] == 0 and timestamp_tokens.sum() <= 1:
            # either there are no timestamps or there are no consecutive ones
            return []
        elif np.where(timestamp_tokens)[0][-1] + 1 not in consecutive:
            # we add the final timestamp if it is not already in the list
            consecutive = np.append(consecutive, np.where(timestamp_tokens)[0][-1] + 1)

        last_slice = np.where(timestamp_tokens)[0][0]
        cur_max_timestamp = 0
        prev_segments_len = 0
        for current_slice in consecutive:
            sliced_tokens = token_ids[last_slice:current_slice]
            if len(sliced_tokens) > 1:
                start_timestamp_position = sliced_tokens[0].item() - timestamp_begin
                end_timestamp_position = sliced_tokens[-1].item() - timestamp_begin

                if start_timestamp_position < cur_max_timestamp:
                    # next segment has started
                    prev_segments_len += cur_max_timestamp

                cur_max_timestamp = end_timestamp_position

                # strip timestamp tokens from the text output
                sliced_tokens = self._preprocess_token_ids(sliced_tokens)
                text = self._decode(sliced_tokens)
                text = self._filter_timestamp_ids(text)
                offsets.append(
                    {
                        "text": text,
                        "timestamp": (
                            (start_timestamp_position + prev_segments_len) * time_precision,
                            (end_timestamp_position + prev_segments_len) * time_precision,
                        ),
                    }
                )
            last_slice = current_slice

        return offsets

    @lru_cache
    def timestamp_ids(self, time_precision=0.02):
        """
        Compute the timestamp token ids for a given precision and save to least-recently used (LRU) cache.

        Args:
            time_precision (`float`, *optional*, defaults to 0.02):
                The time ratio to convert from token to time.
        """
        return self.convert_tokens_to_ids([("<|%.2f|>" % (i * time_precision)) for i in range(1500 + 1)])

    def _preprocess_token_ids(self, token_ids, skip_special_tokens: bool = False):
        """
        Pre-process the token ids for decoding by removing the prompt tokens ids and timestamp token ids.

        Args:
            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Typically, obtained using the `__call__` method of the tokenizer.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens from the token ids. If `True`, the prompt token ids will be
                removed.
        """
        if skip_special_tokens:
            prompt_token_id = self.convert_tokens_to_ids("<|startofprev|>")
            decoder_start_token_id = self.convert_tokens_to_ids("<|startoftranscript|>")
            token_ids = self._strip_prompt(token_ids, prompt_token_id, decoder_start_token_id)

        return token_ids

    def _filter_timestamp_ids(self, token_ids):
        return re.sub(self.timestamp_pat, "", token_ids)
    def decode(
        self,
        token_ids,
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = None,
        output_offsets: bool = False,
        time_precision: float = 0.02,
        decode_with_timestamps: bool = False,
        normalize: bool = False,
        basic_normalize: bool = False,
        remove_diacritics: bool = False,
        **kwargs,
    ) -> str:
        """
        Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
        tokens and clean up tokenization spaces.

        Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.

        Args:
            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding. Will remove the previous tokens (pre-prompt)
                if present.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
            output_offsets (`bool`, *optional*, defaults to `False`):
                Whether or not to output the offsets of the tokens. This should only be set if the model predicted
                timestamps. If there are previous tokens (pre-prompt) to decode, they will only appear in the decoded
                text if they contain timestamp tokens.
            time_precision (`float`, *optional*, defaults to 0.02):
                The time ratio to convert from token to time.
            decode_with_timestamps (`bool`, *optional*, defaults to `False`):
                Whether or not to decode with timestamps included in the raw text.
            normalize (`bool`, *optional*, defaults to `False`):
                Whether or not to apply the English text normalizer to the decoded text. Only applicable when the
                target text is in English. Otherwise, the basic text normalizer should be applied.
            basic_normalize (`bool`, *optional*, defaults to `False`):
                Whether or not to apply the Basic text normalizer to the decoded text. Applicable to multilingual
                target text.
            remove_diacritics (`bool`, *optional*, defaults to `False`):
                Whether or not to remove diacritics when applying the Basic text normalizer. Removing diacritics may
                destroy information in the decoded text, hence it should be used with caution.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `str`: The decoded sentence.
        """
        filtered_ids = self._preprocess_token_ids(
            token_ids,
            skip_special_tokens=skip_special_tokens,
        )

        text = super().decode(
            filtered_ids,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            normalize=normalize,
            basic_normalize=basic_normalize,
            remove_diacritics=remove_diacritics,
            **kwargs,
        )
        if decode_with_timestamps:
            # legacy method to decode timestamps when not included in the tokenizer vocabulary
            text = self._decode_with_timestamps(
                filtered_ids, time_precision=time_precision, skip_special_tokens=skip_special_tokens
            )
        else:
            text = self._filter_timestamp_ids(text)

        # retrieve offsets
        if output_offsets:
            offsets = self._compute_offsets(token_ids, time_precision=time_precision)
            return {"text": text, "offsets": offsets}
        return text

    def _decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        normalize: bool = False,
        basic_normalize: bool = False,
        remove_diacritics: bool = False,
        **kwargs,
    ) -> str:
        self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)

        # Build the string separately for added tokens and byte-level tokens to avoid
        # mixing byte-level and unicode representations.
        sub_texts = []
        current_sub_text = []
        for token in filtered_tokens:
            if skip_special_tokens and token in self.all_special_ids:
                continue
            if token in self.added_tokens_encoder:
                if current_sub_text:
                    sub_texts.append(self.convert_tokens_to_string(current_sub_text))
                    current_sub_text = []
                sub_texts.append(token)
            else:
                current_sub_text.append(token)
        if current_sub_text:
            sub_texts.append(self.convert_tokens_to_string(current_sub_text))

        text = "".join(sub_texts)

        if normalize:
            clean_text = self.normalize(text)
            return clean_text
        elif basic_normalize:
            clean_text = self.basic_normalize(text, remove_diacritics=remove_diacritics)
            return clean_text
        else:
            return text

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        text = "".join(tokens)
        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
        return text

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )
        normalizer_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["normalizer_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        with open(merge_file, "w", encoding="utf-8") as writer:
            writer.write("#version: 0.2\n")
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        if self.english_spelling_normalizer is not None:
            with open(normalizer_file, "w", encoding="utf-8") as f:
                f.write(
                    json.dumps(self.english_spelling_normalizer, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
                )

        return vocab_file, merge_file, normalizer_file

    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
        if is_split_into_words or add_prefix_space:
            text = " " + text
        return (text, kwargs)

    def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
        self.set_prefix_tokens(task=task, language=language, predict_timestamps=not no_timestamps)
        # prefix tokens are of the form: <|startoftranscript|> <|lang_id|> <|task|> <|notimestamps|>
        # we don't want to force the bos token at position 1, as this is the starting token
        # when we generate, so we slice the prefix tokens to: <|lang_id|> <|task|> <|notimestamps|>
        forced_tokens = self.prefix_tokens[1:]
        forced_decoder_ids = [(rank + 1, token) for rank, token in enumerate(forced_tokens)]
        return forced_decoder_ids

    def _decode_asr(self, model_outputs, *, return_timestamps, return_language, time_precision):
        return _decode_asr(
            self,
            model_outputs,
            return_timestamps=return_timestamps,
            return_language=return_language,
            time_precision=time_precision,
        )

    def get_prompt_ids(self, text: str, return_tensors="np"):
        """Converts prompt text to IDs that can be passed to [`~WhisperForConditionalGeneration.generate`]."""
        batch_encoding = self("<|startofprev|>", " " + text.strip(), add_special_tokens=False)

        # Check for special tokens
        prompt_text_ids = batch_encoding["input_ids"][1:]
        special_token_id = next((x for x in prompt_text_ids if x >= self.all_special_ids[0]), None)
        if special_token_id is not None:
            token = self.convert_ids_to_tokens(special_token_id)
            raise ValueError(f"Encountered text in the prompt corresponding to disallowed special token: {token}.")

        batch_encoding.convert_to_tensors(tensor_type=return_tensors)
        return batch_encoding["input_ids"]

    def _strip_prompt(self, token_ids: List[int], prompt_token_id: int, decoder_start_token_id: int):
        if not isinstance(token_ids, list):
            token_ids = self._convert_to_list(token_ids)

        # handle the case of an empty list of token ids
        if not token_ids:
            return token_ids

        has_prompt = token_ids[0] == prompt_token_id
        if has_prompt:
            if decoder_start_token_id in token_ids:
                return token_ids[token_ids.index(decoder_start_token_id) :]
            else:
                return []

        return token_ids

    @staticmethod
    def _convert_to_list(token_ids):
        # convert type to ndarray if necessary
        if hasattr(token_ids, "numpy"):
            if "torch" in str(type(token_ids)):
                token_ids = token_ids.cpu().numpy()
            elif "tensorflow" in str(type(token_ids)):
                token_ids = token_ids.numpy()
        elif "jaxlib" in str(type(token_ids)):
            token_ids = token_ids.tolist()
        # now the token ids are either a numpy array or a (possibly nested) list
        if isinstance(token_ids, np.ndarray):
            token_ids = token_ids.tolist()
        return token_ids


def _decode_asr(tokenizer, model_outputs, *, return_timestamps, return_language, time_precision):
    """
    Internal method meant to only be used by asr pipeline. Handles all the little quirks specific to whisper to handle
    the various options not allowed in other seq2seq models
    """
    # =========== Overview ============
    # - iterate over all outputs
    # - all tokens within output
    # - Each token can be
    #   - language token
    #   - special token
    #   - timestamp token
    #   - text token
    # - We accumulate the text tokens.
    # - We split on end timestamps
    # - Lots of complexity comes from stride and timestamps

    last_language = None

    def new_chunk():
        return {"language": last_language, "timestamp": [None, None], "text": ""}

    # Welcome to the state machine!
    chunks = []
    chunk = new_chunk()
    time_offset = 0.0
    timestamp_begin = tokenizer.convert_tokens_to_ids("<|notimestamps|>") + 1
    previous_tokens = []
    previous_token_timestamps = []
    skip = False
    right_stride_start = None

    all_special_ids = set(tokenizer.all_special_ids)
    prompt_token_id = tokenizer.convert_tokens_to_ids("<|startofprev|>")
    decoder_start_token_id = tokenizer.convert_tokens_to_ids("<|startoftranscript|>")
    # - iterate over all outputs
    for chunk_id, output in enumerate(model_outputs):
        # We can drop everything to Python list, it's going to make our lives easier
        token_ids = output["tokens"][0].tolist()
        token_ids = tokenizer._strip_prompt(token_ids, prompt_token_id, decoder_start_token_id)

        if return_timestamps == "word":
            token_timestamps = output["token_timestamps"][0].tolist()

        # Those keep track of timestamps within strides, which need to be skipped
        # and resolved all at once in a single chunk.
        last_timestamp = None
        first_timestamp = timestamp_begin

        if "stride" in output:
            chunk_len, stride_left, stride_right = output["stride"]
            # Offset the timings to account for the other `model_outputs`.
            time_offset -= stride_left
            right_stride_start = chunk_len - stride_right

            # Keeping track of timestamps within strides: we're NOT going to split
            # on those, and delay until we're out of BOTH strides.
            if stride_left:
                first_timestamp = stride_left / time_precision + timestamp_begin
            if stride_right:
                for token in reversed(token_ids):
                    if token >= timestamp_begin:
                        # There can be several tokens in the right stride,
                        # but the last one is ALWAYS going to be skipped.
                        if (
                            last_timestamp is not None
                            and (token - timestamp_begin) * time_precision < right_stride_start
                        ):
                            break
                        last_timestamp = token

        current_tokens = []
        current_token_timestamps = []

        # - all tokens within output
        for i, token in enumerate(token_ids):
            # 4 possible states for each token
            # - 1/ Language code
            # - 2/ all other special tokens (which we ignore)
            # - 3/ Timestamp
            # - 4/ Regular text
            if token in all_special_ids:
                # Either language code or other
                text = tokenizer.decode([token])
                # Removing outer shell <|XX|>
                text = text[2:-2]
                language = LANGUAGES.get(text, None)
                if language is not None:
                    # 1/ Indeed some language
                    if last_language and language != last_language and not return_timestamps:
                        previous_tokens.append(current_tokens)
                        resolved_tokens = _find_longest_common_sequence(previous_tokens)
                        resolved_text = tokenizer.decode(resolved_tokens)
                        chunk["text"] = resolved_text
                        chunks.append(chunk)

                        # Flush all our temporary context
                        previous_tokens = []
                        current_tokens = []
                        chunk = new_chunk()
                    chunk["language"] = language
                    last_language = language
                else:
                    # 2/ This is a regular special token, ignoring it
                    pass
            elif token >= timestamp_begin:
                # 3/ Timestamp token
                time = (token - timestamp_begin) * time_precision + time_offset
                time = round(time, 2)
                if last_timestamp and token >= last_timestamp:
                    # Whisper outputted a timestamp token, but it falls within our
                    # stride, so we're going to skip it for the time being and
                    # resolve it later. Timestamp tokens always come in pairs, so
                    # we need to skip the next one too.
                    skip = True
                elif skip or (previous_tokens and token < first_timestamp):
                    skip = False
                elif chunk["timestamp"][0] is None:
                    chunk["timestamp"][0] = time
                else:
                    # This is the end of the timestamp chunk
                    if time == chunk["timestamp"][0]:
                        # This is a duplicate timestamp token in the model output,
                        # acting as a stop where it should be a start: just skip it
                        # so it becomes de-facto a start again.
                        pass
                    else:
                        chunk["timestamp"][1] = time
                        # Handling merges.
                        previous_tokens.append(current_tokens)
                        if return_timestamps == "word":
                            previous_token_timestamps.append(current_token_timestamps)
                        resolved_tokens, resolved_token_timestamps = _find_longest_common_sequence(
                            previous_tokens, previous_token_timestamps
                        )
                        resolved_text = tokenizer.decode(resolved_tokens)
                        chunk["text"] = resolved_text
                        if return_timestamps == "word":
                            chunk["words"] = _collate_word_timestamps(
                                tokenizer, resolved_tokens, resolved_token_timestamps, last_language, return_language
                            )
                        chunks.append(chunk)

                        # Flush all our temporary context
                        previous_tokens = []
                        current_tokens = []
                        previous_token_timestamps = []
                        current_token_timestamps = []
                        chunk = new_chunk()
            else:
                # 4/ Regular token: append to the list of all tokens so we can
                # handle merges later and decode into text.
                current_tokens.append(token)
                if return_timestamps == "word":
                    start_time = round(token_timestamps[i] + time_offset, 2)
                    if i + 1 < len(token_timestamps):
                        end_time = round(token_timestamps[i + 1] + time_offset, 2)
                    else:
                        end_time = None  # should never happen
                    current_token_timestamps.append((start_time, end_time))

        if "stride" in output:
            time_offset += chunk_len - stride_right

        # Leftover tokens
        if current_tokens:
            previous_tokens.append(current_tokens)
            if return_timestamps == "word":
                previous_token_timestamps.append(current_token_timestamps)
        elif not (any(p for p in previous_tokens)):
            chunk = new_chunk()
            previous_tokens = []
            current_tokens = []
            previous_token_timestamps = []
            current_token_timestamps = []

    if previous_tokens:
        if return_timestamps:
            logger.warning(
                "Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. "
                "Also make sure WhisperTimeStampLogitsProcessor was used during generation."
            )
        # Happens when we don't use timestamps
        resolved_tokens, resolved_token_timestamps = _find_longest_common_sequence(
            previous_tokens, previous_token_timestamps
        )
        resolved_text = tokenizer.decode(resolved_tokens)
        chunk["text"] = resolved_text
        if return_timestamps == "word":
            chunk["words"] = _collate_word_timestamps(
                tokenizer, resolved_tokens, resolved_token_timestamps, last_language, return_language
            )
        chunks.append(chunk)

    # Preparing and cleaning up the pipeline output
    full_text = "".join(chunk["text"] for chunk in chunks)
    if return_timestamps or return_language:
        for chunk in chunks:
            if not return_timestamps:
                chunk.pop("timestamp")
            else:
                chunk["timestamp"] = tuple(chunk["timestamp"])
            if not return_language:
                chunk.pop("language")

        if return_timestamps == "word":
            new_chunks = []
            for chunk in chunks:
                new_chunks.extend(chunk["words"])
            optional = {"chunks": new_chunks}
        else:
            optional = {"chunks": chunks}
    else:
        optional = {}
    return full_text, optional


def _find_longest_common_sequence(sequences, token_timestamp_sequences=None):
    # It would be much harder to do O(n) because of fault tolerance.
    # We actually have a really good property: the total sequence MUST be those
    # subsequences in order. If `token_timestamp_sequences` is provided, those
    # sequences are split in exactly the same way.
    left_sequence = sequences[0]
    left_length = len(left_sequence)
    total_sequence = []

    if token_timestamp_sequences:
        left_token_timestamp_sequence = token_timestamp_sequences[0]
        total_token_timestamp_sequence = []

    for seq_idx, right_sequence in enumerate(sequences[1:]):
        max_ = 0.0
        max_indices = (left_length, left_length, 0, 0)
        # Here we're sliding matches:
        # [a, b, c, d]
        #          [c, d, f]
        # =        [c] == [d]
        right_length = len(right_sequence)
        for i in range(1, left_length + right_length):
            # epsilon to favor long perfect matches
            eps = i / 10000.0

            # Slightly convoluted because we don't want out-of-bound indices.
            # This will be necessary for a small conflict resolution later on.
            left_start = max(0, left_length - i)
            left_stop = min(left_length, left_length + right_length - i)
            left = np.array(left_sequence[left_start:left_stop])

            right_start = max(0, i - left_length)
            right_stop = min(right_length, i)
            right = np.array(right_sequence[right_start:right_stop])

            # We can only match subsequences of the same size.
            if len(left) != len(right):
                raise RuntimeError(
                    "There is a bug within whisper `decode_asr` function, please report it. Dropping to prevent bad inference."
                )

            if token_timestamp_sequences:
                # Get length of longest subsequence of tokens that match and have
                # timestamps that are in order
                matches = sum(
                    1
                    for idx, elem in enumerate(left)
                    if (
                        elem == right[idx]
                        and left_token_timestamp_sequence[left_start + idx]
                        <= token_timestamp_sequences[seq_idx + 1][right_start + idx]
                    )
                )
            else:
                matches = np.sum(left == right)

            matching = matches / i + eps
            if matches > 1 and matching > max_:
                max_ = matching
                max_indices = (left_start, left_stop, right_start, right_stop)

        (left_start, left_stop, right_start, right_stop) = max_indices

        # This is a small conflict optimization since those sequences overlap in
        # audio: we give more confidence to the left sequence for the left of the
        # overlap, and to the right sequence for the right of the overlap.
        left_mid = (left_stop + left_start) // 2
        right_mid = (right_stop + right_start) // 2
        total_sequence.extend(left_sequence[:left_mid])
        left_sequence = right_sequence[right_mid:]
        left_length = len(left_sequence)

        if token_timestamp_sequences:
            total_token_timestamp_sequence.extend(left_token_timestamp_sequence[:left_mid])
            left_token_timestamp_sequence = token_timestamp_sequences[seq_idx + 1][right_mid:]

    total_sequence.extend(left_sequence)

    if token_timestamp_sequences is None:
        return total_sequence

    if len(token_timestamp_sequences) > 0:
        total_token_timestamp_sequence.extend(left_token_timestamp_sequence)
        return total_sequence, total_token_timestamp_sequence
    else:
        return total_sequence, []


def _collate_word_timestamps(tokenizer, tokens, token_timestamps, language, return_language):
    words, _, token_indices = _combine_tokens_into_words(tokenizer, tokens, language)

    optional_language_field = {"language": language} if return_language else {}

    timings = [
        {
            "text": word,
            "timestamp": (token_timestamps[indices[0]][0], token_timestamps[indices[-1]][1]),
            **optional_language_field,
        }
        for word, indices in zip(words, token_indices)
    ]
    return timings


def _combine_tokens_into_words(
    tokenizer,
    tokens: List[int],
    language: str = None,
    prepend_punctuations: str = "\"'“¡¿([{-",
    append_punctuations: str = "\"'.。,，!！?？:：”)]}、",
):
    """
    Groups tokens by word. Returns a tuple containing a list of strings with the words, and a list of `token_id`
    sequences with the tokens making up each word.
    """
    if language is None:
        language = tokenizer.language
    if language is None:
        language = "english"

    if language in {"chinese", "japanese", "thai", "lao", "myanmar", "cantonese"}:
        # These languages don't typically use spaces.
        words, word_tokens, token_indices = _split_tokens_on_unicode(tokenizer, tokens)
    else:
        words, word_tokens, token_indices = _split_tokens_on_spaces(tokenizer, tokens)

    _merge_punctuations(words, word_tokens, token_indices, prepend_punctuations, append_punctuations)
    return words, word_tokens, token_indices


def _split_tokens_on_unicode(tokenizer, tokens: List[int]):
    """Combine tokens into words by splitting at any position where the tokens are decoded as valid unicode points."""
    decoded_full = tokenizer.decode(tokens, decode_with_timestamps=True)
    replacement_char = "\ufffd"

    words = []
    word_tokens = []
    token_indices = []
    current_tokens = []
    current_indices = []
    unicode_offset = 0

    for token_idx, token in enumerate(tokens):
        current_tokens.append(token)
        current_indices.append(token_idx)
        decoded = tokenizer.decode(current_tokens, decode_with_timestamps=True)

        if (
            replacement_char not in decoded
            or decoded_full[unicode_offset + decoded.index(replacement_char)] == replacement_char
        ):
            words.append(decoded)
            word_tokens.append(current_tokens)
            token_indices.append(current_indices)
            current_tokens = []
            current_indices = []
            unicode_offset += len(decoded)

    return words, word_tokens, token_indices


def _split_tokens_on_spaces(tokenizer, tokens: List[int]):
    """Combine tokens into words by splitting at whitespace and punctuation tokens."""
    subwords, subword_tokens_list, subword_indices_list = _split_tokens_on_unicode(tokenizer, tokens)
    words = []
    word_tokens = []
    token_indices = []

    for subword, subword_tokens, subword_indices in zip(subwords, subword_tokens_list, subword_indices_list):
        special = subword_tokens[0] >= tokenizer.eos_token_id
        with_space = subword.startswith(" ")
        punctuation = subword.strip() in "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
        if special or with_space or punctuation or len(words) == 0:
            words.append(subword)
            word_tokens.append(subword_tokens)
            token_indices.append(subword_indices)
        else:
            words[-1] = words[-1] + subword
            word_tokens[-1].extend(subword_tokens)
            token_indices[-1].extend(subword_indices)

    return words, word_tokens, token_indices


def _merge_punctuations(words, tokens, indices, prepended, appended):
    """Merges punctuation tokens with neighboring words."""
    # prepend punctuations
    i = len(words) - 2
    j = len(words) - 1
    while i >= 0:
        if words[i].startswith(" ") and words[i].strip() in prepended:
            words[j] = words[i] + words[j]
            tokens[j] = tokens[i] + tokens[j]
            indices[j] = indices[i] + indices[j]
            words[i] = ""
            tokens[i] = []
            indices[i] = []
        else:
            j = i
        i -= 1

    # append punctuations
    i = 0
    j = 1
    while j < len(words):
        if not words[i].endswith(" ") and words[j] in appended:
            words[i] = words[i] + words[j]
            tokens[i] = tokens[i] + tokens[j]
            indices[i] = indices[i] + indices[j]
            words[j] = ""
            tokens[j] = []
            indices[j] = []
        else:
            i = j
        j += 1

    # remove elements that are now empty
    words[:] = [word for word in words if word]
    tokens[:] = [token for token in tokens if token]
    indices[:] = [idx for idx in indices if idx]