
    g<t                    p   d dl Z d dlZd dlZd dlZd dlmZmZmZmZm	Z	m
Z
 d dlZd dlZd dlmc mZ d dlmZ d dlmZ ddlmZmZ ddlmZmZmZmZmZ ddlmZ dd	lm Z  dd
l!m"Z" ddl#m$Z$m%Z%  e"j&        e'          Z(dej)        de*dej)        fdZ+dej,        fdZ-d Z.	 	 	 	 ddZ/ G d de          Z0dS )    N)CallableIteratorListOptionalTupleUnion)nn)EncoderDecoderCache   )GenerationConfigGenerationMixin)LogitsProcessorList$SuppressTokensAtBeginLogitsProcessorSuppressTokensLogitsProcessorWhisperNoSpeechDetectionWhisperTimeStampLogitsProcessor)StoppingCriteriaList)BaseModelOutput)logging   )TASK_IDSTO_LANGUAGE_CODEinputsfilter_widthreturnc                 ,   |dk    s	|dz  dk    rt          d          |dz  }| j        d         |k    r| S t          j                            | ||ddfd          } |                     d|d                                          d         d|f         }|S )	z
    Applies a median filter of width `filter_width` along the last dimension of the input.

    The `inputs` tensor is assumed to be 3- or 4-dimensional.
    r      r   z&`filter_width` should be an odd numberreflect)mode.)
ValueErrorshaper	   
functionalpadunfoldsort)r   r   	pad_widthresults       j/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/whisper/generation_whisper.py_median_filterr*   -   s     qL1,11ABBB!I|B9$$ ]v	9a'C)TTF ]]2|Q//4466q9#y.IFM    matrixc                 8   | j         \  }}t          j        |dz   |dz   ft          j                  t          j        z  }t          j        |dz   |dz   ft          j                   }d|d<   t          d|dz             D ]}t          d|dz             D ]q}||dz
  |dz
  f         }||dz
  |f         }|||dz
  f         }	||k     r||	k     r|d}}
n||k     r||	k     r|d}}
n|	d}}
| |dz
  |dz
  f         |
z   |||f<   ||||f<   r|j         d         dz
  }|j         d         dz
  }d|dddf<   d|dddf<   g }g }|dk    s|dk    r|                    |dz
             |                    |dz
             |||f         dk    r|dz  }|dz  }n>|||f         dk    r|dz  }n*|||f         dk    r|dz  }nt          d| d| d	          |dk    |dk    t          j        |          ddd
         }t          j        |          ddd
         }||fS )z
    Measures similarity between two temporal sequences: the input audio and the output tokens. Used to generate
    token-level timestamps.
    r   dtyper   )r   r   r   Nz9Internal error in dynamic time warping. Unexpected trace[, z]. Please file a bug report.r   )	r"   nponesfloat32infrangeappendRuntimeErrorarray)r,   output_lengthinput_lengthcosttracejic0c1c2cttext_indicestime_indicess                 r)   _dynamic_time_warpingrF   B   s   
 #),M<7MA%|a'78
KKKbfTDWma')9:"*MMMMEDJ1lQ&''  q-!+,, 	 	Aa!eQUl#Ba!eQhBaQhBBww27711bR"WW1111Aq1u-1DAJE!Q$KK	  	AAAAE!QQQ$KE!!!Q$KLL
a%%1q55AE"""AE"""A;!FAFAA1a4[AFAA1a4[AFAApAppQRppp   a%%1q55 8L))$$B$/L8L))$$B$/L%%r+   c                 j    | /t          fd| D             d           }|rt          ||d           S d S )Nc              3   <   K   | ]}t          |          |V  d S N)
isinstance).0clslogit_processor_classs     r)   	<genexpr>z2_get_attr_from_logit_processors.<locals>.<genexpr>z   s3      jj:cShCiCijjjjjjjr+   )nextgetattr)logits_processorrM   attribute_namelogit_processors    `  r)   _get_attr_from_logit_processorsrT   x   sO    #jjjj/?jjjlpqq 	B?NDAAA4r+   rightlongestc                 j   d}g }|dvrt          d|           |dvrt          d|           |dk    r|t          d          | D ]}	|	t          d |	D                       dk    r~t          j        d	 |	D             d
          }
||
| d          }
|t          j        ||
g          }
|                    |
           t          |t          |d
                             }||                    |           |                    t          j        g |                     |dk    r|dz   n|}t          t          |                     D ]H}|t          ||                   z
  }|dk    rd|fn|df}t          j	        ||         ||          ||<   It          j
        |d          }|S )Nr   )rU   leftz5`padding_side` must be either 'right' or 'left', not )rV   
max_lengthz8`padding` must be either 'longest' or 'max_length', not rY   z>`cut_off_length` must be specified when `padding='max_length'`c                     g | ]
}|d          S tokens rK   ds     r)   
<listcomp>z&_pad_to_max_length.<locals>.<listcomp>   s    4_4_4_QQx[4_4_4_r+   c                     g | ]
}|d          S r[   r]   r^   s     r)   r`   z&_pad_to_max_length.<locals>.<listcomp>   s    !L!L!L!!H+!L!L!Lr+   r   dimdevicer   rU   )r$   value)r!   lentorchcatr6   maxtensorr5   Fr$   stack)current_segmentspad_token_idre   padding_sidepaddingbos_token_tensorcut_off_lengthmax_total_length	sequencescurrent_segment_listsequencer>   
pad_lengthr$   s                 r)   _pad_to_max_lengthry      s'    I,,,_Q]__```///]T[]]^^^	L	 	 ^%;YZZZ 0 > >+4_4_J^4_4_4_0`0`cd0d0dy!L!L7K!L!L!LRTUUUH)#^O$4$45+ 9&6%ABBX&&&"#3S25G5GHH)-....U\"V<<<====-4-D-D~))JZ3'(()) H H%IaL(9(99
!-!8!8q*ooz1ouYq\s,GGG	!I1---Ir+   c            0       d    e Zd Z	 d=dZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d>deej                 dee         dee         dee	         d	ee
eej        gee         f                  d
edee         dee         deeeee         f                  dee         deej                 dee         dee         deeeeedf         f                  dee         dee         dee         dee         deej                 dedee         dedee         f.dZ fdZed             Zd  Zd! Zd" Zd# Zed$             Zed%             Zed&             Zed'             Zd( Zed)             Zd* Z 	 	 	 	 d?deej!                 d,eeej!        e"f                  dee         ded-ej        f
d.Z#ed/             Z$ed0             Z%ed1             Z&ed2             Z'ed3             Z(ed4             Z)d5 Z*ed6             Z+ed7             Z,ed8             Z-d9 Z.ed:             Z/ed;             Z0ed<             Z1 xZ2S )@WhisperGenerationMixin{Gz?Nc                 @
   g t          | j        j                  D ]<                    t	          j        fd|j        D             d                     =t	          j        fd|D                                           g d          d}d|v r|j	        dk    
                    d                                          }||n||z   }t	          j        |j	        ddd|f                   |j	        ddd||z
  f         dd|df<   ddddd|f                             dk    d	          t	          j        fd
t          j        d                   D             d          |pd	         j        d         }|j        j        d	         }t	          j        ||dz   ft          j        |j        j                  }	|Kt'          |t(                    rdd|dz  f         n%t'          |t*          t,          t.          j        f          r;t3          t/          j        |                    dk    rdd|d	         dz  f         nt'          |t          j                  r;t3          t	          j        |                    dk    rdd|d	         dz  f         nnt'          |t(                    r|n|t3          |          z  }
t'          |t          j                  r|                                n|}t/          j        ||
          }|t'          |t(                    rgt	          j        ddd          }t	          j        dd          }|z
  |z  tA          | j        j!                                      d          t          |          D ]m}|t'          |t,          t*          t.          j        t          j        f          r~|dd||         dz  f         }t	          j        |ddd          }t	          j        |dd          }||z
  |z  }tA          || j        j!                  }|                    d	          }n|         }tE          |                                #                                $                                           \  }}t/          j%        t/          j&        |          dd          '                    tP                    }||         |z  }t	          j)        |          |	|ddf<   o|	S )a  
        Calculates token-level timestamps using the encoder-decoder cross-attentions and dynamic time-warping (DTW) to
        map each output token to a position in the input audio. If `num_frames` is specified, the encoder-decoder
        cross-attentions will be cropped before applying DTW.

        Returns:
            tensor containing the timestamps in seconds for each predicted token
        c                      g | ]
}|         S r]   r]   )rK   xr>   s     r)   r`   zDWhisperGenerationMixin._extract_token_timestamps.<locals>.<listcomp>   s    ._._._qt._._._r+   r   rb   c                 :    g | ]\  }}|         d d |f         S rI   r]   )rK   lhcross_attentionss      r)   r`   zDWhisperGenerationMixin._extract_token_timestamps.<locals>.<listcomp>   s0    UUUTQ/2111a48UUUr+   )r   r   r   r   Nbeam_indicesr   r   c           
      r    g | ]3}t          j        d d d d |d d f         dd d |f                   4S )Nr   )rc   index)rh   index_select)rK   r>   r   weightss     r)   r`   zDWhisperGenerationMixin._extract_token_timestamps.<locals>.<listcomp>   sf        &wqqq!!!Qz':VWVWVWYZVZI[\\\  r+   r   r/   re   .TF)rc   keepdimunbiased)rc   r   )r   r   )constant_values)*r5   configdecoder_layersr6   rh   ri   r   rm   permuter   sumrj   
zeros_likemasked_fillr"   ru   zerosr3   re   rJ   intlisttupler1   ndarrayrg   uniqueTensorcpurepeatstdmeanr*   median_filter_widthrF   doublenumpyr$   diffastypeboolrk   )selfgenerate_outputsalignment_headstime_precision
num_framesnum_input_idsweight_lengthr:   
batch_size
timestampsrepeat_timer   r   	batch_idxr,   rD   rE   jumps
jump_timesr   r   r>   r   s                      @@@@r)   _extract_token_timestampsz0WhisperGenerationMixin._extract_token_timestamps   sf    t{122 	i 	iA##EI._._._._=M=^._._._ef$g$g$ghhhh +UUUU_UUUVV//,,,//--- .:b@EEbIIMMOOM-:-BMMXeHeM !+,<,I!!!^m^J[,\]]L.>.KAAAOnQ^anQnOnLn.oLMNN*+aaaN]N23G (33LB4FJJL k    "<#5a#899     G %D(8(;(A!(D%/5a8
[)*%-HXHbHi
 
 

 ! *c** @!#'8q'8"89Jubj(ABB 
@s29U_K`K`GaGaefGfGf!#';A!);';";<J77 @CZ@X@X<Y<Y]^<^<^!#';A!);';";< -7z3,G,GjjjZ[^_i[j[jMj1;J1W1WgZ^^---]g
Yz;??
J!<!<)GTEJJJC:g2t<<<D~,G$Wdk.MNNG llql))G z** 	A 	AI%*Z%rz[`[gAh*i*i% C1M:i3HA3M1M!MN iBuMMMz&b$??? 4-3.'0OPP  ++ +)>

@S@S@U@U@[@[@]@]?])^)^&L,F27<00&!LLLSSTXYYE%e,~=J(-Z(@(@Jy!""}%%r+   Finput_featuresgeneration_configrQ   stopping_criteriaprefix_allowed_tokens_fnsynced_gpusreturn_timestampstasklanguageis_multilingual
prompt_idsprompt_condition_typecondition_on_prev_tokenstemperature.compression_ratio_thresholdlogprob_thresholdno_speech_thresholdnum_segment_framesattention_maskr   return_token_timestampsreturn_segmentsreturn_dict_in_generatec                   ;< d|v r/|                     d          }t          j        dt                      | j        |fi |\  }}| j        j        j        j        d         | j        j        j	        j        d         z  }|| j
        j        z  }|                     |||          \  }}||k    }|                     ||||          }|                     |||          }|                     |	||
|           |                     |||           |                     |||||	           |                     ||
           |                     |||| j
        ||          }|                     |           d|v r|d         d         j        n|j        }|j        d         } |                     ||| |                    dd          |          }|                     ||           t5          |t6          t8          f          s|gn|}!|!d         }|                     ||||          \  }"}#|j        <|                     ||#|"||||          \  }$}%}}#}"}}&|                      ||%|          }'|#|"k     !                                r| "                    ||#|"|%|$          \  }}%}$|#|z  |z  }(|"|#z
  #                    |          })| $                    ||#|)||%|$          }*tK          |tL          d          }+| '                    |%||'|$|&||| j
        |j        |+|          \  },}| (                    | j
        |,|           |5|D ]2}-tS          |-d          r |-*                    |,j        d                    3| +                    |*|,|%|$|#||"|!|||||||&||||          \  }.}/}0}&}1tY          |.          D ]\  }2}3|$|2         }4|0|2         r|#|4xx         |)|4         z  cc<   ,| -                    |3|/|(||)|||4|2|
  
        \  }5}6|'|4xx         |5z  cc<   |r|#|4xx         |"|2         z  cc<   w|#|4xx         |6z  cc<   |#|"k     !                                ||j.        d k    rd! |'D             n|'}7t_          |7|j0        | j        d"#          }8|r|8|7d$S |rg|j1        F|j2        ?tg          j4        |8j        d         df|j5                  }9tg          j6        |8|9gd%          }8|r+i }:|8|:d&<   tg          j7        d' |/D             d%          |:d(<   n|8}:|r|j8        r| 9                    |/|1|8j        |          ;<dk    rtS          ;d)          rG;j:        @t9          ;<fd*tw          ty          ;j:                            D                       ;_:        tS          ;d+          rG;j=        @t9          ;<fd,tw          ty          ;j=                            D                       ;_=        |r|:d(         ;d(<   ;S |:S |8S )-aG<  
        Transcribes or translates log-mel input features to a sequence of auto-regressively generated token ids.

        <Tip warning={true}>

        Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
        model's default generation configuration. You can override any `generation_config` by passing the corresponding
        parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.

        For an overview of generation strategies and code examples, check out the [following
        guide](./generation_strategies).

        </Tip>

        Parameters:
            input_features (`torch.Tensor` of shape `(batch_size, feature_size, sequence_length)`, *optional*):
                Float values of log-mel features extracted from the raw speech waveform. The raw speech waveform can be obtained by
                loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via
                the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
                [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
                tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] for details.
            generation_config (`~generation.GenerationConfig`, *optional*):
                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
                passed to generate matching the attributes of `generation_config` will override them. If
                `generation_config` is not provided, the default will be used, which had the following loading
                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
                default values, whose documentation should be checked to parameterize generation.
            logits_processor (`LogitsProcessorList`, *optional*):
                Custom logits processors that complement the default logits processors built from arguments and
                generation config. If a logit processor is passed that is already created with the arguments or a
                generation config an error is thrown. This feature is intended for advanced users.
            stopping_criteria (`StoppingCriteriaList`, *optional*):
                Custom stopping criteria that complement the default stopping criteria built from arguments and a
                generation config. If a stopping criteria is passed that is already created with the arguments or a
                generation config an error is thrown. This feature is intended for advanced users.
            prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
                If provided, this function constraints the beam search to allowed tokens only at each step. If not
                provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
                `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
                on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful
                for constrained generation conditioned on the prefix, as described in [Autoregressive Entity
                Retrieval](https://arxiv.org/abs/2010.00904).
            synced_gpus (`bool`, *optional*, defaults to `False`):
                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
            return_timestamps (`bool`, *optional*):
                Whether to return the timestamps with the text. This enables the `WhisperTimestampsLogitsProcessor`.
            task (`str`, *optional*):
                Task to use for generation, either "translate" or "transcribe". The `model.config.forced_decoder_ids`
                will be updated accordingly.
            language (`str` or list of `str`, *optional*):
                Language token to use for generation, can be either in the form of `<|en|>`, `en` or `english`. For
                batched generation, a list of language tokens can be passed. You can find all the possible language
                tokens in the `model.generation_config.lang_to_id` dictionary.
            is_multilingual (`bool`, *optional*):
                Whether or not the model is multilingual.
            prompt_ids (`torch.Tensor`, *optional*):
                Rank-1 tensor of token IDs created by passing text to [`~WhisperProcessor.get_prompt_ids`] that is
                provided as a prompt to each chunk. This can be used to provide or "prompt-engineer" a context for
                transcription, e.g. custom vocabularies or proper nouns to make it more likely to predict those words
                correctly. It cannot be used in conjunction with `decoder_start_token_id` as it overwrites this value.
            prompt_condition_type (`str`, *optional*):
                Only relevant for long-form transcription. Condition type of `prompt_ids`. 'first-segment' means only the first segment is conditioned on `prompt_ids`. 'all-segments' means each segment is conditioned on `prompt_ids`. Make sure to enable `condition_on_prev_tokens` for 'all-segments'.
                Defaults to 'first-segment'. For short-term transcription only 'first-segment' is possible.
            condition_on_prev_tokens (`bool`, *optional*):
                Only relevant for long-form transcription. Whether to condition each segment on the previous segment.
                As shown in the [the Whisper paper](https://cdn.openai.com/papers/whisper.pdf), this can help to improve
                performance.
            temperature (`float` or list of `float`, *optional*):
                The temperature to be used for generation. Passing a single `float` value and `do_sample=True` activates
                generation using sampling. For long-form transcription, temperature fallback can be activated by passing
                a list of float values such as (0.0, 0.2, 0.4, 0.6, 0.8, 1.0). As shown in the [the Whisper paper](https://cdn.openai.com/papers/whisper.pdf), this can help to improve
                performance.
            compression_ratio_threshold (`float`, *optional*):
                Only relevant for long-form transcription. If defined, the zlib compression rate of each segment will be computed. If the compression rate of
                a segment is higher than `compression_ratio_threshold`, temperature fallback is activated: the generated segment is discarded and the generation is
                repeated using a higher temperature. The intuition behind this feature is that segments with very high compression rates
                suffer from a lot of repetition. The unwanted repetition can be reduced by injecting more randomness by increasing the temperature. If `compression_ratio_threshold` is defined
                make sure that `temperature` is a list of values. A common value for `compression_ratio_threshold` is 1.35.
                As shown in the [the Whisper paper](https://cdn.openai.com/papers/whisper.pdf), this can help to improve
                performance.
            logprob_threshold (`float`, *optional*):
                Only relevant for long-form transcription. If defined, the average log-probability of each segment will be computed. If the log-probability of
                a given segment is lower than `logprob_threshold`, temperature fallback is activated: the generated segment is discarded and the generation is
                repeated using a higher temperature. The intuition behind this feature is that segments of low log-probability
                can be improved by injecting more randomness by increasing the temperature. If `logprob_threshold` is defined
                make sure that `temperature` is a list of values. A common value for `logprob_threshold` is -1.0.
                As shown in the [the Whisper paper](https://cdn.openai.com/papers/whisper.pdf), this can help to improve
                performance.
            no_speech_threshold (`float`, *optional*):
                Only relevant for long-form transcription. If defined, the "no-speech" token combined with the `logprob_threshold`
                is used to determine whether a segment contains only silence. In this case, the transcription for this segment
                is skipped.
                As shown in the [the Whisper paper](https://cdn.openai.com/papers/whisper.pdf), this can help to improve
                performance.
            num_segment_frames (`int`, *optional*):
                The number of frames a single segment is made of. If not defined, `num_segment_frames` defaults to the model's stride
                times the maximum input length.
            attention_mask (`torch.Tensor`, *optional*):
                `attention_mask` needs to be passed when doing long-form transcription using a batch size > 1.
            time_precision (`int`, *optional*, defaults to 0.02):
                The duration of output token in seconds. *E.g.* 0.02 means that a generated token on average accounts
                for 20 ms.
            return_token_timestamps (`bool`, *optional*):
                Whether to return token-level timestamps with the text. This can be used with or without the
                `return_timestamps` option. To get word-level timestamps, use the tokenizer to group the tokens into
                words.
            return_segments (`bool`, *optional*, defaults to `False`):
                Whether to additionally return a list of all segments. Note that this option can only be enabled
                when doing long-form transcription.
            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                Whether or not to return a [`~utils.ModelOutput`] instead of just returning the generated tokens.
                Note that when doing long-form transcription, `return_dict_in_generate` can only be enabled when
                `return_segments` is set True. In this case the generation outputs of each segment is added to each
                segment.
            kwargs (`Dict[str, Any]`, *optional*):
                Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
                forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
                specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.

        Return:
            [`~utils.ModelOutput`] or `torch.LongTensor` or `Dict[str, Any]`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
            or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor` or a dict of segments when `return_segments=True`.

                If the passed input is > 30 seconds / > 3000 mel input features and `return_segments=True` then a dictionary of generated sequence ids, called `sequences` and a list of each generated segment is returned.

                else if the passed input is <= 30 seconds / >= 3000 mel input features, the possible [`~utils.ModelOutput`] types are:

                    - [`~generation.GenerateEncoderDecoderOutput`],
                    - [`~generation.GenerateBeamEncoderDecoderOutput`]

                else only the generated output sequence ids are returned.

        Example:

        - *Longform transcription*: To transcribe or translate audios longer than 30 seconds, process the audio files without truncation and pass all mel features at once to generate.

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, WhisperForConditionalGeneration
        >>> from datasets import load_dataset, Audio

        >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
        >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
        >>> model.cuda()  # doctest: +IGNORE_RESULT

        >>> # load audios > 30 seconds
        >>> ds = load_dataset("distil-whisper/meanwhile", "default")["test"]
        >>> # resample to 16kHz
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=16000))
        >>> # take first 8 audios and retrieve array
        >>> audio = ds[:8]["audio"]
        >>> audio = [x["array"] for x in audio]

        >>> # make sure to NOT truncate the input audio, to return the `attention_mask` and to pad to the longest audio
        >>> inputs = processor(audio, return_tensors="pt", truncation=False, padding="longest", return_attention_mask=True, sampling_rate=16_000)
        >>> inputs = inputs.to("cuda", torch.float32)

        >>> # transcribe audio to ids
        >>> generated_ids = model.generate(**inputs)

        >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
        >>> transcription[0]
        " Folks, if you watch the show, you know, I spent a lot of time right over there. Patiently and astutely scrutinizing the boxwood and mahogany chest set of the day's biggest stories developing the central headline pawns, definitely maneuvering an oso topical night to F6, fainting a classic Sicilian, nade door variation on the news, all the while seeing eight moves deep and patiently marshalling the latest press releases into a fisher's shows in Lip Nitsky attack that culminates in the elegant lethal slow-played, all-passant checkmate that is my nightly monologue. But sometimes, sometimes, folks, I. CHEERING AND APPLAUSE Sometimes I startle away, cubside down in the monkey bars of a condemned playground on a super fun site. Get all hept up on goofballs. Rummage that were discarded tag bag of defective toys. Yank out a fist bowl of disembodied doll limbs, toss them on a stained kid's place mat from a defunct dennies. set up a table inside a rusty cargo container down by the Wharf and challenged toothless drifters to the godless bughouse blitz of tournament that is my segment. Meanwhile."
        ```

        - *Shortform transcription*: If passed mel input features are < 30 seconds, the whole audio will be transcribed with a single call to generate.

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, WhisperForConditionalGeneration
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
        >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

        >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_features = inputs.input_features

        >>> generated_ids = model.generate(inputs=input_features)

        >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> transcription
        ' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
        ```

        r   zXThe input name `inputs` is deprecated. Please make sure to use `input_features` instead.r   )r   input_stridekwargsr   r   r   r   )r   is_shortformr   r   r   r   r   r   r   r   r   r   r   r   r   )r   r   )r   r   r   r   r   )r   encoder_outputsr   	num_beams)r   rQ   begin_indexr   re   r   r   )r   r   total_input_framesr   )r   seek
max_framesinit_tokensr   r   r   )r   r   r   )r   r   r   cur_bszbatch_idx_map)rj   )r   r   seek_num_framesr   r   r   suppress_tokens)r   r   rn   r   do_condition_on_prev_tokensr   r   r   re   r   r   )r   decoder_input_idsr   Nset_begin_indexr   )segment_inputr   r   r   r   r   r   temperaturesr   rQ   r   r   r   r   r   r   r   r   r   )
seek_sequenceseek_outputstime_offsettimestamp_beginr   r   r   prev_idxidxr   first-segmentc                 "    g | ]}|d d         S )r   Nr]   )rK   r   s     r)   r`   z3WhisperGenerationMixin.generate.<locals>.<listcomp>  s     ---qQqrrU---r+   rU   )re   rp   )ru   segmentsrb   ru   c                     g | ]
}|d          S )token_timestampsr]   r^   s     r)   r`   z3WhisperGenerationMixin.generate.<locals>.<listcomp>  s    :g:g:gUV1=O;P:g:g:gr+   r   encoder_attentionsc              3   D   K   | ]}j         |         d d          V  d S rI   )r   rK   r>   dict_outputsnum_return_sequencess     r)   rN   z2WhisperGenerationMixin.generate.<locals>.<genexpr>   sV       @ @ ! );A>?U?UAU?UV@ @ @ @ @ @r+   encoder_hidden_statesc              3   D   K   | ]}j         |         d d          V  d S rI   )r   r   s     r)   rN   z2WhisperGenerationMixin.generate.<locals>.<genexpr>  sV       C C ! )>qABXBXDXBXYC C C C C Cr+   )>popwarningswarnFutureWarning_prepare_generation_configmodelencoderconv1strideconv2r   max_source_positions_retrieve_total_input_frames_set_return_outputs_set_return_timestamps_set_language_and_task_set_num_frames_set_thresholds_and_condition_set_prompt_condition_type_retrieve_init_tokens_check_decoder_input_idsre   r"   _retrieve_logit_processorsget_set_condition_on_prev_tokensrJ   r   r   _retrieve_max_frames_and_seekr    _expand_variables_for_generation_prepare_segmentsany_maybe_reduce_batchclamp_get_input_segmentrT   r   _prepare_decoder_input_ids_set_max_new_tokens_and_lengthhasattrr   generate_with_fallback	enumerate_retrieve_segmentr   ry   ro   max_new_tokensrY   rh   fulleos_token_idri   rm   r   _stack_split_outputsr   r5   rg   r   )=r   r   r   rQ   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   re   r   r   r   r   r   r   r   rn   r   r   r   r   r   procseek_sequencesr   should_skipmodel_output_typer>   r   prev_ir   segment_offsetfinal_segmentsru   
eos_tokensoutputsr   r   s=                                                              @@r)   generatezWhisperGenerationMixin.generate!  s   t v#ZZ11NMj   %DD$CDU$`$`Y_$`$`!6 z)/6q9DJ<N<T<[\]<^^)DK,LL)-)J)J)V *K *
 *
&
& *-?? #'":":$;$;//	 #; #
 #
 55/l^o 6 
 
 	##D/]n 	$ 	
 	
 	
 	$;O`io 	 	
 	
 	
 	**//(C 3%= 	+ 	
 	
 	
 	''/"7 	( 	
 	
 	
 00!/;1 1 
 
 	%%V%444 9JV8S8S)*1-44YgYn!'*::/-#jja00 ; 
 
 	**%=Qb 	+ 	
 	
 	
 -7{T5M,R,Rc}}Xc"1o==!)1%	 > 
 

D  1E 11)!#!%=/ 2 
 
	
'  11!/ 2 
 
 j %%'' p	3
 6:5M5M-%+ 6N 6 62NG] /,>K)D077<N7OOO !33- /#5+ 4  M > "?AR O )-(G(G'!1+,G%"3{") / )H ) )%v //{"3"3 0     +, J JDt%677 J,,->-DR-HIII +++"3+#5%)"3!1"3)A'(?,G)%-' ,  +!0 %.n$=$= 3 3 =&q)q> LLLOF$;;LLL+/+A+A"/!- +$3$3#1!-#,C ,B , ,(. !(((H4((( 3LLLJqM1LLLLLLLN2LLLLa j %%'' p	3l &+<+RVe+e+e .-,----! 	 '-:4;]d
 
 
	
  	H!*GGG "	 /7<M<X<`"Z);Q(?ARA_``
!Iy*&=2FFF	& $'0$.3k:g:gZf:g:g:gmn.o.o.o*++#& $+<+T $#88GXZcZjlrss'!++|-ABB |GfGr:? @ @ @ @ @%*3|/N+O+O%P%P@ @ @ ; ;7
  .EFF(>J=B C C C C C%*3|/Q+R+R%S%SC C C > >: + S7>?Q7RL!34##Nr+   c                 4	   t          j         |          }d t          |          D             }d t          |          D             }d t          |          D             }d t          |          D             }t          t          |                    }|	j        |                     |
|||           t          |          D ]\  }}|d uo|dk    |	_        |	j        r|nd|	_        |	j        rd|	_        t          j         |          }dD ]	}||v r||= 
|j	        d	         }|	j
        d
k    r||k     rt          j        |d	d	d	d	d	||z
  fd	          }t          j        |d	d	d	||z
  f|	j                  }|                    d          't          j        |d         d	d	d	||z
  fd          |d<   |                    d          )t          j        |d         d	d	d	d	d	||z
  fd	          |d<    t                      j        |f|	|
|||||d|}t#          |          }|                     ||||	|          \  }}||k     r|d |         }|d |         }g } g }!g }"g }#t          |          D ]\  }$}%|||$                  }&||&         |z   ||&         k     }'|'r>|%d         |	j        k    r-|%d d         }%|r!|s||$         d         d d         ||$         d<   |%d         |	j        k    rL|%|	j        k                                    }(|%d |(          }%|r"|s ||$         d         d |(          ||$         d<   |                     |%||$|
|	| j        j        |          \  ||$<   ||$<   |%|||$         <   ||$         |||$         <   |d u p|dk     })|	j        o|)|||$         <   ||$         rv|                     ||$                    |!                    ||$                    |"                    ||$                    d|v r!|#                    |d         |$                    | }t5          |          d	k    s|t5          |          dz
  k    r|}|} nEt7          j        |"          }t7          j        |!          }d|v rt7          j        |#          |d<   |||||fS )Nc                     g | ]}d S rI   r]   rK   _s     r)   r`   zAWhisperGenerationMixin.generate_with_fallback.<locals>.<listcomp>-  s    ;;;qd;;;r+   c                     g | ]}d S rI   r]   r'  s     r)   r`   zAWhisperGenerationMixin.generate_with_fallback.<locals>.<listcomp>.  s    :::aT:::r+   c                     g | ]}d S Fr]   r'  s     r)   r`   zAWhisperGenerationMixin.generate_with_fallback.<locals>.<listcomp>/  s    888A%888r+   c                     g | ]}d S r+  r]   r'  s     r)   r`   zAWhisperGenerationMixin.generate_with_fallback.<locals>.<listcomp>0  s    555u555r+           g      ?r   )	do_sampler   r   r   static)rf   decoder_attention_maskTr   )r   rQ   r   r   r   r   r   )r   r   r   r   r   r   r   g      ?)copyr5   r   r   _setup_no_speech_detectionr  r.  r   r   r"   cache_implementationrl   r$   ro   r  superr$  type_postprocess_outputsr  r   _need_fallbackr   
vocab_sizer   r6   rg   rh   rm   )+r   r   r   r   r   r   r   r   r   r   rQ   r   r   r   r   r   r   r   r   r   seek_sequence_listseek_outputs_listneeds_fallbackr  fallback_index_mapfallback_idxr   generate_kwargskeyr   r  r  new_fallback_index_mapnew_segment_inputnew_decoder_input_idsnew_decoder_attention_maskr>   r   r  is_not_finalnum_paddingsis_low_temperature	__class__s+                                             r)   r  z-WhisperGenerationMixin.generate_with_fallback  s   , 6"" <;E'NN;;;::5>>:::88w88855eGnn555!%..110<++,<mM^`fggg)2<)@)@ s	[ s	[%L+*5T*A*WkTWFW';L;V,_KK\_) * 0./!+"i//O@ - -/))','-a0G 5AAgPZFZFZ !maAq!ZRYEY5Zbc d d d$%E%1ag1E'FN_Nl% % %! #&&'?@@L@A'(@AAq!ZZaMaCbjnA A AO$<= #&&'899E9:'(9:Q1aJY`L`<aij: : :O$56 ,577+
"3!1"3)A'"3-
 
 "
 
L !%\ 2 2 ,0+D+D)"3(?"3) ,E , ,(NL ##!/!9+HWH5 &(" "$&!)+&$-n$=$= )_ )_ =&'9!'<= $V/A AZPVEWW   gM"$59J9W$W$W$1#2#$6M. g| g>J1oN`>abecebe>fQ(:; !$(9(FFF$15F5S$S#X#X#Z#ZL$1.L=.$AM. r| r>J1oN`>abpdpcpbp>qQ(:; 594G4G! $%K*5 51q!;q> =J"#5a#89;G?!"4Q"78%0D%8%MK#<M"%>UCU ,,>q,AB "!$ _*112DQ2GHHH%,,]1-=>>>)001B11EFFF/699299&AY:Z[\:]^^^!7 %&&!++|s<?P?PST?T/T/T!30 !&,A B B!K(9::M'61138;?Y3Z3Z/0|[:UWhhhr+   c                       P|j         dk    rEt          |dd           } d         |k    r
 dd          n   fdt          |          D             }nd t          |          D             }|S )Nr   prev_sot_token_idr   r   c                     g | ]}d igS r[   r]   )rK   r(  r   s     r)   r`   z<WhisperGenerationMixin._prepare_segments.<locals>.<listcomp>  s!    TTTQ(J!7 8TTTr+   c                     g | ]}g S r]   r]   r'  s     r)   r`   z<WhisperGenerationMixin._prepare_segments.<locals>.<listcomp>  s    >>>q>>>r+   )r   rP   r5   )r   r   r   rI  rn   s   `    r)   r  z(WhisperGenerationMixin._prepare_segments  s    !&7&MQ`&`&` '(9;NPT U U+5a=<M+M+MABBS]JTTTT%
BSBSTTT>>E*,=,=>>>r+   c                    	 s|j         d         nt          j        d          }t          t          j                  rd d |d f         fS |rdt          |d          rTt          |dd           }                     |j        ||j         d                   d<   d         d d |d f         d<   d         d d |d f         d<   d
 fd		d         }	fd	t          |j         d                   D             |fS )Nr   r   r   r   )r   r   r   ru   c           	      $   |;|dk    r5d t          | |         d t          |                              D             S |dv rfd| D             S |dv rt          fd| D                       S |dk    r|sd S t          | t                    rg }t          j        j                  D ]}g }| j        | j	        fD ]L}|j
        |j        fD ];}	|                    |	|                  d                                                     <M|                    t          |                     t          |          S g }t          t          |                     D ]}	g }| |	         D ]^}
t          |
          dk    r4|                    |
         d                                                     I|                    |
           _|                    t          |                     t          |          S |                                          S )	Nscoresc                 H    g | ]\  }}||                                           S r]   r   )rK   vbeam_idxs      r)   r`   z]WhisperGenerationMixin._postprocess_outputs.<locals>.split_by_batch_index.<locals>.<listcomp>  s*    sssmq(())sssr+   rN  r   r   logitsc                 D    g | ]}|                                          S r]   rP  rK   rQ  r   s     r)   r`   z]WhisperGenerationMixin._postprocess_outputs.<locals>.split_by_batch_index.<locals>.<listcomp>  s)    ;;;q)((**;;;r+   decoder_attentionsdecoder_hidden_statesr   c              3   N   K   | ]}t          fd |D                       V   dS )c              3   X   K   | ]$}|         d                                           V  %d S rI   rP  )rK   wr   s     r)   rN   zfWhisperGenerationMixin._postprocess_outputs.<locals>.split_by_batch_index.<locals>.<genexpr>.<genexpr>  s8      "G"G1Y<#5#9#9#;#;"G"G"G"G"G"Gr+   N)r   rV  s     r)   rN   z\WhisperGenerationMixin._postprocess_outputs.<locals>.split_by_batch_index.<locals>.<genexpr>  s@      XXAU"G"G"G"GQ"G"G"GGGXXXXXXr+   past_key_valuesr   )ziprg   r   rJ   r
   r5   r   r   self_attention_cachecross_attention_cache	key_cachevalue_cacher6   r   )valuesr?  r   r   r   all_past_key_values	layer_idxlayer_past_key_values	cache_clsrQ  r\  r   s     `        r)   split_by_batch_indexzIWhisperGenerationMixin._postprocess_outputs.<locals>.split_by_batch_index  sn   'C8OOsss6<XaKbcpehioepepcpKq?r?rssssYYY;;;;F;;;;YYYXXXXQWXXXXXX)))# 64(;<< 6*,'%*4;+E%F%F Q Q	02-*0*EvGc)d b bI&/&99;P%Q b b 5 < <Qy\)=TUY=Z=^=^=`=` a a a ab+2259N3O3OPPPP !4555*,'"3v;;// Q Q02-!' @ @A"1vv{{ 5 < <Qy\$=O=S=S=U=U V V V V 5 < <Q ? ? ? ?+2259N3O3OPPPP !4555)$((***r+   c                 X    g | ]%fd                                  D             &S )c                 ^    i | ])\  }}| ||                     d                     *S )r   )r   )r  )rK   krQ  r>   r   r   rh  s      r)   
<dictcomp>zJWhisperGenerationMixin._postprocess_outputs.<locals>.<listcomp>.<dictcomp>  sU       Aq ''1aLL\L\]kLlLlmmm  r+   )items)rK   r>   r   r   rh  s    @r)   r`   z?WhisperGenerationMixin._postprocess_outputs.<locals>.<listcomp>  sh     
 
 

 	      (..00  
 
 
r+   rI   )
r"   rh   rk   rJ   r   r  rP   r   r   r5   )
r   r   r   r   r   r   	start_idxr   sequence_tokensrh  s
   ``   `   @r)   r6  z+WhisperGenerationMixin._postprocess_outputs  s    8DX%+B//VW	lEL11 	.'9::6L--" 	_w/@BS'T'T 	_ !2L$GGJ/3/M/M!1%/5b9	 0N 0 0L+, 0<<N/OPQPQPQS\S]S]P]/^L+,$0$=aaam$L[! 	+  	+  	+  	+  	+  	+D '{3
 
 
 
 
 

 ?0344
 
 
 ,,r+   c           
         i }d                                          D ]dv r9t          j        fdD             d                                        |<   @dv rGt	          fdt          t          d                                      D                       |<   dk    r9t          j        fdD             d                                        |<   ʉd	v rHt	          fd
t          t          d                                      D                       |<   dk    r|                    d          }d                  }t	          fdt          t          d                                      D                       |<   |3t          |t                    r|
                    |                   |<   d |<    |di |S )Nr   )ru   r   c                      g | ]
}|         S r]   r]   rK   rQ  r?  s     r)   r`   z?WhisperGenerationMixin._stack_split_outputs.<locals>.<listcomp>      +I+I+IqAcF+I+I+Ir+   rb   rS  c              3      K   | ]8t          j        fd D                                                     V  9dS )c                 ,    g | ]}|                  S r]   r]   )rK   rQ  r>   r?  s     r)   r`   zIWhisperGenerationMixin._stack_split_outputs.<locals>.<genexpr>.<listcomp>  s!     A A Aq3 A A Ar+   N)rh   rm   torK   r>   re   r?  r   s    @r)   rN   z>WhisperGenerationMixin._stack_split_outputs.<locals>.<genexpr>  se       % %RSEK A A A A AL A A ABBEEfMM% % % % % %r+   sequences_scoresc                      g | ]
}|         S r]   r]   rr  s     r)   r`   z?WhisperGenerationMixin._stack_split_outputs.<locals>.<listcomp>  rs  r+   rW  c           
   3      K   | ]Nt          fd t          t          d                  d                             D                       V  OdS )c              3      K   | ]Lt          j        fd D                                           d                                        V  MdS )c                 8    g | ]}|                           S r]   r]   rK   rQ  r>   r=   r?  s     r)   r`   zSWhisperGenerationMixin._stack_split_outputs.<locals>.<genexpr>.<genexpr>.<listcomp>	  s%    $H$H$HaQsVAYq\$H$H$Hr+   r   Nrh   rm   squeezerv  rK   r=   re   r>   r?  r   s    @r)   rN   zHWhisperGenerationMixin._stack_split_outputs.<locals>.<genexpr>.<genexpr>  s{         $H$H$H$H$H$H<$H$H$HIIQQRSTTWWX^__     r+   r   Nr   r5   rg   rw  s    @r)   rN   z>WhisperGenerationMixin._stack_split_outputs.<locals>.<genexpr>  s       % %
 	        !&s<?3+?+B'C'C!D!D    % % % % % %r+   r]  c           
   3      K   | ]Nt          fd t          t          d                  d                             D                       V  OdS )c              3      K   | ]Lt          j        fd D                                           d                                        V  MdS )c                 8    g | ]}|                           S r]   r]   r}  s     r)   r`   zSWhisperGenerationMixin._stack_split_outputs.<locals>.<genexpr>.<genexpr>.<listcomp>  s%    (L(L(L!31(L(L(Lr+   r   Nr~  r  s    @r)   rN   zHWhisperGenerationMixin._stack_split_outputs.<locals>.<genexpr>.<genexpr>  s{         ! "K(L(L(L(L(L(L|(L(L(LMMUUVWXX[[\bcc     r+   r   Nr  rw  s    @r)   rN   z>WhisperGenerationMixin._stack_split_outputs.<locals>.<genexpr>  s       ) )
 	        %*3|As/CA/F+G+G%H%H    ) ) ) ) ) )r+   r]   )keysrh   rm   rv  r   r5   rg   r  rJ   r
   from_legacy_cache)r   r   r  re   r   r#  past_key_value_typer?  s    ` `   @r)   r  z+WhisperGenerationMixin._stack_split_outputs  s   ?'')) 	( 	(C333${+I+I+I+IL+I+I+IqQQQTTU[\\[[[$ % % % % % %W\]`amnoapqtau]v]vWwWw% % %     ***${+I+I+I+IL+I+I+IqQQQTTU[\\[[[$ % % % % % %
 #3|As';#<#<==% % %     )))&,jj1B&C&C#?3'3#( ) ) ) ) ) )
 "'s<?3+?'@'@!A!A) ) ) $ $GCL +6:FY[n;o;o6':'L'LWUX\'Z'Z#'GCL  ++7+++r+   c                    d}d}	|j         #|                     ||          }
|
|j         k    rd}|j        at          |d         d          rd |D             |         }n+||         d         }|                     |||j        |          }||j        k     rd}|j        6t          |t          d          }||j        k     r||         |j        k    rd}d}	||	fS )NFTr   rx  c                     g | ]
}|d          S )rx  r]   )rK   ss     r)   r`   z9WhisperGenerationMixin._need_fallback.<locals>.<listcomp>3  s    HHHaA01HHHr+   rN  no_speech_prob)	r   _retrieve_compression_ratior   r  _retrieve_avg_logprobsr  r   rT   r   )r   r   r   r   rQ   r   r8  r   r;  r  compression_ratiologprobsrN  r  s                 r)   r7  z%WhisperGenerationMixin._need_fallback  s    8D $ @ @PZ [ [ #4#PPP!%.:|A(:;; HH<HHHO%e,X666M+<+I;  +===!%0<< ":<L N
 ,>>>"5),=,QQQ!&"{**r+   c                 R   |j         |j         dk    rt          t          ||j         z                      }t          |          }	fdt          t          |                    D             }
|                    |j         d          }|                    |j         d          }|                    |j         d          }|                    |j         d          }d|_         n9|}	t          t          |	                    }fdt          |	          D             }
||	|||||
fS )Nr   c                     g | ]}S r]   r]   rK   r(  r   s     r)   r`   zKWhisperGenerationMixin._expand_variables_for_generation.<locals>.<listcomp>Q  s    *g*g*g+C*g*g*gr+   r   rb   c                     g | ]}S r]   r]   r  s     r)   r`   zKWhisperGenerationMixin._expand_variables_for_generation.<locals>.<listcomp>Z  s    *\*\*\+C*\*\*\r+   )r   r   r5   rg   repeat_interleave)r   r   r   r   r   r   r   r   r   r   r   s         `    r)   r  z7WhisperGenerationMixin._expand_variables_for_generationK  sN    1=BSBhklBlBl z4E4Z'Z![![\\M-((G*g*g*g*gUSVWdSeSeMfMf*g*g*g'+==>O>djk=llN))*;*PVW)XXD#556G6\bc5ddJ%778I8^de7ffK5622 G w00M*\*\*\*\USZ^^*\*\*\' '
 	
r+   c                     t          | t          d          }d |                                D             } |||d|           d S )N
set_inputsc                 B    i | ]\  }}t          j        |          ||S r]   )rh   	is_tensor)rK   rk  rQ  s      r)   rl  zEWhisperGenerationMixin._setup_no_speech_detection.<locals>.<dictcomp>i  s-    NNNA5?1;M;MN1NNNr+   )r   r   )rT   r   rm  )rQ   r   r   r   r  extra_kwargss         r)   r2  z1WhisperGenerationMixin._setup_no_speech_detectionf  sV    45EG_amnn
NNNNN
mBSddWcdeeeeer+   c                     | | j         d         | j         d         fS d|v rNt          |d         t                    r|d         d         j         n|d         j         }|d         |d         |z  fS t          d          )Nr   r   r   r   zPMake sure to provide either `input_features` or `encoder_outputs` to `generate`.)r"   rJ   r   r!   )r   r   r   encoder_outputs_shapes       r)   r   z3WhisperGenerationMixin._retrieve_total_input_framesl  s    %!'*N,@,DDD&& f%67II5()!,22-.4 "
 )+-B1-E-TTTklllr+   c                    d| d}| 0t                               |                    d|                       |0t                               |                    d|                      |0t                               |                    d|                      |0t                               |                    d|                      t          |t          t
          f          rt          d| d| d          d S )	NzAudio input consists of only z@. Short-form transcription is activated.{}, but will be ignored.z#condition_on_prev_tokens is set to z&compression_ratio_threshold is set to zlogprob_threshold is set to zno_speech_threshold is set to zE. Short-form transcription is activated.temperature cannot be set to z which can only be used for temperature fallback for long-form generation. Make sure to set `temperature` to a float value or `None` for short-form generation.)loggerwarningformatrJ   r   r   r!   )r   r   r   r   r   r   warning_prefixs          r)   _maybe_warn_unused_inputsz0WhisperGenerationMixin._maybe_warn_unused_inputs{  sG   ',> ' ' ' 	
 $/NN>001qWo1q1qrrsss&2NN%%&lOj&l&lmm   (NN>001cPa1c1cddeee*NN>001gRe1g1ghhiii kD%=11 	]0B ] ]0;] ] ]  	 	r+   c                     | |j         } n| |_         ||_        |rd|_         d|_        d|_        |d|_         d|_        | S )NT)r   r   output_attentionsoutput_scoresr   s       r)   r   z*WhisperGenerationMixin._set_return_outputs  sg    "*&7&O##8O54K1" 	38<526/.2+(8<5.2+&&r+   c                 >   |t          |d          r|j        }|s/|du rt          d          t                              d           d}|rt          |d          st          d          ||_        t          |d          r|j        dz   }n| j        j        dz   }|S )	Nr   Fa  You have passed more than 3000 mel input features (> 30 seconds) which automatically enables long-form generation which requires the model to predict timestamp tokens. Please either pass `return_timestamps=True` or make sure to pass no more than 3000 mel input features.z:Setting `return_timestamps=True` for long-form generation.Tno_timestamps_token_idad  You are trying to return timestamps, but the generation config is not properly set. Make sure to initialize the generation config with the correct attributes that are needed such as `no_timestamps_token_id`. For more details on how to generate the approtiate config, refer to https://github.com/huggingface/transformers/issues/21878#issuecomment-1451902363r   )r  r   r!   r  infor  r   r8  )r   r   r   r   r   s        r)   r   z-WhisperGenerationMixin._set_return_timestamps  s    $1BDW)X)X$ 1 C 	% E)) m  
 KKTUUU $ 	W->@X%Y%Y 	g   /@+$&>?? 	9/FJOO #k4q8Or+   c                 N   |&t          |d          st          d          ||_        t          |d          r|j        s|| t          d          | &t          |d          st          d          | |_        |(t          |d          st          d          ||_        d S d S )Nr   zThe generation config is outdated and is thus not compatible with the `is_multilingual` argument to `generate`. Please update the generation config as per the instructions https://github.com/huggingface/transformers/issues/25084#issuecomment-1664398224zCannot specify `task` or `language` for an English-only model. If the model is intended to be multilingual, pass `is_multilingual=True` to generate, or update the generation config.
lang_to_ida=  The generation config is outdated and is thus not compatible with the `language` argument to `generate`. Either set the language using the `forced_decoder_ids` in the model config, or update the generation config as per the instructions https://github.com/huggingface/transformers/issues/25084#issuecomment-1664398224
task_to_ida5  The generation config is outdated and is thus not compatible with the `task` argument to `generate`. Either set the task using the `forced_decoder_ids` in the model config, or update the generation config as per the instructions https://github.com/huggingface/transformers/issues/25084#issuecomment-1664398224)r  r!   r   r   r   r   s       r)   r  z-WhisperGenerationMixin._set_language_and_task  s
   &,.?@@  g  
 1@-$&788 	ARAb 	8#7 n  
 ,l;;  _  
 *2&,l;;  _  
 &*""" r+   c           	      
   dt           t                   dt          dt          t                   fd}dt          dt          ffdt	          dd           }t	          dd           }	j        }
|
-|	*|(|
d	         d
         t                              d           nt          |d          r|j        |j        }
|
)|'t                              d| d|
 d| d           d }
n*|
(|	&t                              d|	 d|
 d|	 d           d }
j	        g|
|
d	         d	         d
k    rd
t          |
          d	k    rX|
d	         d	         k    rF|
d	         d
         gz  |
d
d          }
d
z  t          |
          d	k    r|
d	         d	         k    Ft          |
          d	k    r"t          d|
 d|
d	         d	          d          d _        t                    d
k    pt                    d
k    o	d
         d u }t          |	t          t          f          rat          d |	D                       rt!          d          t          |	          |k    r#t          d| dt          |	           d          |	}n|	d g|z  }n|	g}fd|D             d }|	fd|D             }nQt          d          rA|r?|                     ||                    dd           |                                          }|kt)          t                              D ]Nt                             d
k    r|                  d
<   -                             |                    O~t)          t                              D ]||t,          v rh                             j        j                            j        j                 } |         |j                                                   nt          d| d t,           d!          |	it          d"          rYt          fd#j                                        D                       s&                             j        d$                    j        sHt          d%          r8         d&         j        k    r!                             j                   nKj        rD         d&         j        k    r-t                              d'                    d d&         <   d(          D             <   t;          j        t:          j        | j         )          !                    |d&          S )*Nlstnumitrc                      t           fdD                       }|rfd D              n                                 S )z/short function to replace num with a itr in lstc              3       K   | ]}|v V  	d S rI   r]   )rK   r>   r  s     r)   rN   zWWhisperGenerationMixin._retrieve_init_tokens.<locals>.replace_or_add.<locals>.<genexpr>  s'      ..QS......r+   c                      g | ]
}|v rn|S r]   r]   )rK   r>   r  r  s     r)   r`   zXWhisperGenerationMixin._retrieve_init_tokens.<locals>.replace_or_add.<locals>.<listcomp>  s%    ;;;!a3hhssA;;;r+   )r  r6   )r  r  r  founds   ``` r)   replace_or_addzDWhisperGenerationMixin._retrieve_init_tokens.<locals>.replace_or_add  sa    ....#.....E  ;;;;;s;;;

3Jr+   r   r   c           	         |                                  } | j                                        v r| }n| t          j                    v rdt          |           d}n| t          j                    v rd|  d}nit          |           dk    }t          d|  d|r t          t          j                              nt          t          j                               d          |j        vrt          | d          j        |         S )Nz<|z|>r   zUnsupported language: z. Language should be one of: .z is not supported by this specific model as it is not in the `generation_config.lang_to_id`.(You should just add it to the generation config))lowerr  r  r   rc  rg   r!   r   )r   language_tokenis_language_coder   s      r)   language_to_idzDWhisperGenerationMixin._retrieve_init_tokens.<locals>.language_to_id  sK   ~~''H,7<<>>>>!)-24444!D&6x&@!D!D!D-46666!2h!2!2!2#&x==A#5  rX r r;Kn-466777QUVfVkVmVmQnQnr r r   %6%AAA % H H H  
 %/??r+   r   r   r   ai  Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.forced_decoder_idszYou have passed task=z,, but also have set `forced_decoder_ids` to zQ which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=r  zYou have passed language=zU which creates a conflict. `forced_decoder_ids` will be ignored in favor of language=zYou are using token ids in `forced_decoder_ids` that do not seem to correctly follow the prompt pattern of Whisper. Make sure that z) has an entry for all indices >= 1 and < c              3      K   | ]}|d u V  	d S rI   r]   )rK   r   s     r)   rN   z?WhisperGenerationMixin._retrieve_init_tokens.<locals>.<genexpr>A  s&      //19//////r+   zExpected `language` to be `None`, a single string (e.g. `'en'`), or a list of strings with length equal to the batch size (e.g. `('en', 'fr')` for a batch size of 2). Got a list containing `None`.zgWhen passing a list of languages, the length of the list must match the batch size. Expected length of z
, but got z languages.c                 8    g | ]}t          j                   S r]   )r1  )rK   r(  r   s     r)   r`   z@WhisperGenerationMixin._retrieve_init_tokens.<locals>.<listcomp>R  s#    AAA!ty--AAAr+   c                 &    g | ]} |          S r]   r]   )rK   r   r  s     r)   r`   z@WhisperGenerationMixin._retrieve_init_tokens.<locals>.<listcomp>W  s#    ===aq))===r+   r  r   )r   r   r   r   zThe `z3`task is not supported. The task should be one of ``r  c              3   ,   K   | ]}|         v V  d S rI   r]   )rK   tir>   r   s     r)   rN   z?WhisperGenerationMixin._retrieve_init_tokens.<locals>.<genexpr>v  s,      ``B2Q/``````r+   
transcriber  r   zm<|notimestamps|> prompt token is removed from generation_config since `return_timestamps` is set to `'True'`.c                     g | ]}||S rI   r]   )rK   rC   s     r)   r`   z@WhisperGenerationMixin._retrieve_init_tokens.<locals>.<listcomp>  s    IIIA1=a===r+   r   )"r   r   r   strrP   r  r  warning_oncer  decoder_start_token_idrg   r!   rJ   r   r   r  	TypeErrordetect_languager  tolistr5   r6   r   r  r   rc  r   r  r  rh   	as_tensorlongre   expand)r   r   r   r   r   r   r   r  r   r   r  is_lang_id_undefined	languageslang_idstask_idr>   r   r  s      `           @@@r)   r  z,WhisperGenerationMixin._retrieve_init_tokens  s>   	S	 	 	(3- 	 	 	 		@S 	@S 	@ 	@ 	@ 	@ 	@ 	@, (&$77,j$??.A)DL5G5J15M5U##k   V122 	;v7P7\!'!:)d.> G  G  GZl  G  G  @D  G  G  G   "&+0D WH  W  Wbt  W  W  LT  W  W  W   "&(?@).@.CA.F!.K.KA())A--2DQ2G2Ja2O2O 21 5a 899%7%;"Q ())A--2DQ2G2Ja2O2O
 %&&**  s  [m  s  s  Xj  kl  Xm  no  Xp  s  s  s  
 04,";//14i[9I9IA9M9hR]^_R`dhRh hu.. 	#//h/////  [   8}}
** [*4[ [@CH[ [ [   !II+II!
I BAAAyAAA ====9===HH&55 	:N 	++- &

+<d C C"3#5	 ,  
 fhh  3{++,, 7 7{1~&&**(0KN1%%N))(1+6666 s;''(( 	J 	JA8##N))*;*FGXG]*^___/:;L;QRG #N;q>7<M<X<_<_<a<abbbb$%qT%q%qfn%q%q%qrrr%'2C\*R*R%`````:K:V:]:]:_:_````` VN))*;*F|*TUUU &75-/GHH5  N2&*;*RRRA%%&7&NOOOO!358CAr8JN_Nv8v8v D   "-Q!4A JIQIIIKNN{%*T[QQQXXYceghhhr+     r   r   c                    ||t          d          ||t          d          |!d|ddddd|f         i}|j        d         }n6|4d|i}t          |t                    r|d         j        d         n|d         }|p| j        }t          j        |df| j        t
          j                  |j	        z  }t          j
                    5   | di |d	|ij        ddd
f         }ddd           n# 1 swxY w Y   t          j        |d         t
          j                  }	d|	t          |j                                                  <   t"          j         |dd|	f<   |                    d
          }
|
S )a  
        Detects language from log-mel input features or encoder_outputs

        Parameters:
            input_features (`torch.Tensor` of shape `(batch_size, feature_size, sequence_length)`, *optional*):
                Float values of log-mel features extracted from the raw speech waveform. The raw speech waveform can be obtained by
                loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via
                the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
                [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
                tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] for details.
            encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
                Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
                `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
                hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
            generation_config (`~generation.GenerationConfig`, *optional*):
                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
                passed to generate matching the attributes of `generation_config` will override them. If
                `generation_config` is not provided, the default will be used, which had the following loading
                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
                default values, whose documentation should be checked to parameterize generation.
            num_segment_frames (`int`, *optional*, defaults to 3000):
                The number of log-mel frames the model expects

        Return:
            A `torch.LongTensor` representing the detected language ids.
        Nz@You have to specify either `input_features` or `encoder_outputs`zTMake sure to specificy only one of `input_features` or `encoder_outputs` - not both!r   r   r   r   re   r/   r   r   r.   Fr]   )r!   r"   rJ   r   r   rh   r2   re   r  r  no_gradrT  	ones_liker   r   r  rc  r1   r4   argmax)r   r   r   r   r   r   r   r   rT  non_lang_maskr  s              r)   r  z&WhisperGenerationMixin.detect_language  s   D !o&=_```'O,Gsttt'&qqq!!!=P>P=P7P(QRF'-a0JJ('9F/9/?/[/[s"(++apqras  .G1GJ
At{%*MMM67 	
 ]__ 	W 	WTHHFHH6GHHHOPQPQPQSUPUVF	W 	W 	W 	W 	W 	W 	W 	W 	W 	W 	W 	W 	W 	W 	W q	DDDEJd,7>>@@AAB$&F7qqq- ==$$s   C44C8;C8c                     |                      dd           }|                      dd           }||t          d          d S d S )Nr   assistant_modelzQPassing `decoder_input_ids` is deprecated. Consider passing `prompt_ids` instead.)r  r!   )r   r   r  s      r)   r  z/WhisperGenerationMixin._check_decoder_input_ids  sY    "JJ':DAA **%6==(_-Hc   )(-H-Hr+   c                     | rkt          |dd           dk    rt                              d           t          |d          st	          d          |                    dd           |_        d S d S )Nr   	translatez@Token-level timestamps may not be reliable for task 'translate'.r   zModel generation config has no `alignment_heads`, token-level timestamps not available. See https://gist.github.com/hollance/42e32852f24243b748ae6bc1f985b13a on how to add this property to the generation config.r   )rP   r  r  r  r!   r   r   r   s      r)   r  z&WhisperGenerationMixin._set_num_frames  s    " 	J(&$77;FFabbb,.?@@  R   ,2::lD+I+I(((	J 	Jr+   c                     ||nt          | dd           | _        ||nt          | dd           | _        ||nt          | dd           | _        ||nt          | dd           | _        d S )Nr   r   r   r   )rP   r   r   r   r   r   s        r)   r  z4WhisperGenerationMixin._set_thresholds_and_condition  s     !, *,?FF 	+ +6 ('*,I4PP 	5 #.  *,A4HH 	- (3 %$*,FMM 	222r+   c                     ddg}|p|d         }||vr(t          d| dd                    |                     | j        dur|dk    rt          d          || _        d S )	Nr   all-segmentsr   z`prompt_condition_type=zD does not exist. Make sure to set `prompt_condition_type` to one of r0   TzeMake sure to set `condition_on_prev_tokens=True` when setting `prompt_condition_type='all-segments'`.)r!   joinr   r   )r   r   allowed_cond_typess      r)   r  z1WhisperGenerationMixin._set_prompt_condition_type  s    -~> !6 N9KA9N (::: e*?  e  e  FJ  FO  FO  Pb  Fc  Fc  e  e   5TAAF[_mFmFmw   3H///r+   c                 >    | | nt          |dd          } | |_        d S )Nr   F)rP   r   r   s     r)   r	  z4WhisperGenerationMixin._set_condition_on_prev_tokens  s:     (3 %$*,FNN 	!
 6N222r+   c                    | dk    r|s|t          d          | dk    rh|sf|                    d                                                              t          j                  }t	          j        | ft          j                  }nEt	          j        | ft          j                  |z  }t	          j        | ft          j                  }||fS )Nr   zWhen doing batched long-form audio transcription, make sure to pass an `attention_mask`. You can retrieve the `attention_mask` by doing `processor(audio, ..., return_attention_mask=True)` r   r.   )r!   r   r   rv  rh   r  r   r2   )r   r   r   r   r   r   s         r)   r
  z4WhisperGenerationMixin._retrieve_max_frames_and_seek  s    >>,>>3I O   !^^L^'++B//335588DDJ;
}EJ???DDZMDDDGYYJ;
}EJ???D4r+   c                    |j         du rt          ||          }||gn|g|z   }|j        (t          |j        |          }||gn|g|z   }d |_        |j        )t          |j        ||          }||gn|g|z   }d |_        |j        >t          |j        dz
  ||dk              }	||	gn|	g|z   }|		                    |            |S )NT)r   rd   )r   re   r   )no_speech_tokenr   scores_is_logprobs)
r   r   r   r   begin_suppress_tokensr   r   r   r  	set_model)
r   r   rQ   r   r   re   timestamp_processorsuppress_tokens_processorbegin_suppress_processorno_speech_detectors
             r)   r  z1WhisperGenerationMixin._retrieve_logit_processors*  s`   .$66"ABSal"m"m"m)9)A$%%H[G\_oGo  ,8(EFWFgpv(w(w(w% $+ +++/03CC 
 15-2>'K!7[Y_( ( ($
 $+ ***./2BB 
 7;30<!9 1 H1 L'#,q=" " " )9(@#$$GYFZ]mFm  ((...r+   c                    |}g }t          |          D ]j}||         }||         ||         k    r9|||z
  z   }	|dz  }t          j        | d |	         | |	dz   d          gd          } U|                    |           k| ||fS )Nr   r   rb   )r5   rh   ri   r6   )
r   r   r   r   r   prev_bsznew_batch_idx_mapr>   r  	cut_indexs
             r)   r  z*WhisperGenerationMixin._maybe_reduce_batchR  s    x 	1 	1A"1%FF|z&1118!34	1!&N:I:,FW`cdWdWfWfHg+hno!p!p!p "((0000w(999r+   c                 \   | d S g }t          |          D ]}||         }| ||dz   d d ||         ||         ||         z   f         }	|	j        d         |k     r&t          j        |	d||	j        d         z
  f          }	|                    |	           t          j        |d          }|S )Nr   r   r   )r$   rb   )r5   r"   rl   r$   r6   rh   ri   )
r   r   r   r   r   r   r   r>   r  segment_input_slices
             r)   r  z)WhisperGenerationMixin._get_input_segmentb  s    !4w 
	6 
	6A"1%F"0QUAAAtF|dSYl]lms]tNt?t1t"u"(,/AAA&'e'a1CFYF_`bFc1c-d' ' '#   !45555	-Q777r+   c           	          d|
v r|
                     d          }||
fS |j        dz  dz
  }||         }t          |dd           }||	|	d         nd }t                    rt	          d                   dk    rfd|D             }||j        dk    r|}n2t          j        | df|t          j        	          }|||d         z  nd }|j	        d
k    rdnd}t          ||j        |d|||          }t          j        ||gd          }||j        k    |
d<   nn|V|d                              |j        d         d          }t          j        ||gd          }|
                     dd            n|
                     dd            ||
fS )Nr   r   r   rI  r   r   c                 4    g | ]}|         r|         nd S rI   r]   )rK   r>   rn   r   s     r)   r`   zEWhisperGenerationMixin._prepare_decoder_input_ids.<locals>.<listcomp>  s2    vvvcd6QRS6T^/22Z^vvvr+   r  r  r/  rY   rV   rX   )re   rp   rq   rr   rs   r   rb   r0  )r   max_target_positionsrP   r  rg   r   rh   r2   r  r3  ry   ro   ri   r   r"   )r   r   rn   r   r   r   r   r   re   r   r   r   rs   prev_start_of_textactive_segmentsprev_ids
one_tensorrq   prev_tokenss     ` `              r)   r  z1WhisperGenerationMixin._prepare_decoder_input_idsx  s    &(( &

+> ? ?$f,,49A='6$%68KTRR%8G8S!4!4Y]*++ 	74DQ4G0H0H10L0LvvvvvhuvvvO%*;*QUc*c*c%"Z!V5:VVV
ASA_-
1==ei&7&LPX&X&Xll^gG,!.#!)-  K !&	;8I*JPR S S S/@DUDb/bF+,,#$T*112C2I!2LaPPK %	;8I*JPR S S SJJ/6666 JJ/666 &((r+   c                 |   |j         |j         nd}||j        d         z   | j        j        k    rLt	          d|j        d          d| d||j        d         z    d| j        j         d| j        j         d          t          |j        d	z  d
z
  |j        d         d
z
            }|j        L|j         Et          |j        |z   |j                  }t                              d|j         d| d           d S |j         <|j         |j        d         z   |j        k    r |j        |j        d         z
  }||_         d S d S d S )Nr   r   zjThe length of `decoder_input_ids`, including special start tokens, prompt tokens, and previous tokens, is z,  and `max_new_tokens` is zL. Thus, the combined length of `decoder_input_ids` and `max_new_tokens` is: z@. This exceeds the `max_target_positions` of the Whisper model: z. You should either reduce the length of your prompt, or reduce the value of `max_new_tokens`, so that their combined length is less than r  r   r   zIncrease max_length from z to z0 since input is conditioned on previous segment.)	r  r"   r   r  r!   minrY   r  r  )r   r   r   r   r  num_initial_tokensrY   s          r)   r  z5WhisperGenerationMixin._set_max_new_tokens_and_length  s   =N=]=i*99op-3B77$+:ZZZb  ~O  ~U  VX  ~Y b b,:b b@NQbQhikQl@lb b AE@`b b
 ?Ck>^b b b   !!<!AA!EGXG^_aGbefGfgg '38I8X8`.9<NNPVPkllJKK K,=,H  K  Kj  K  K  K     ,8!03D3J23NNQWQlll#8;L;RSU;VVN/=,,,	 98llr+   c                    t          t          j        |          dz            dz   d                    fd|                                 D                       }t          |          t          t          j        |                    z  }|S )zUCompute byte length of zlib compressed token bytes vs. byte length of raw token bytes   r   r+   c                 <    g | ]}|                     d           S )little)to_bytes)rK   rC   lengths     r)   r`   zFWhisperGenerationMixin._retrieve_compression_ratio.<locals>.<listcomp>  s'    VVV

68 < <VVVr+   )r   mathlog2r  r  rg   zlibcompress)r\   r8  token_bytesr  r  s       @r)   r  z2WhisperGenerationMixin._retrieve_compression_ratio  s     TYz**Q.//!3hhVVVVfmmooVVVWW,,s4=3M3M/N/NN  r+   c                 X   |dk    r|nd}t          j        |                               j                  } | j        d         j        d         k    r| d j        d                  } n| j        d          d          t          j        | |z                                  d                              | j                  t          fdt          j        d                   D                       }k    	                    d          nj        d         }||dz   z  }|S )Nr-  r   r   r   rb   c              3   Z   K   | ]%}|         |                  |         k    z  V  &d S rI   r]   )rK   r>   r  r  r\   s     r)   rN   z@WhisperGenerationMixin._retrieve_avg_logprobs.<locals>.<genexpr>  s?      ttVWHQKq	2fQi<6OPttttttr+   )rh   rm   rv  re   r"   rl   log_softmaxfloatr/   r   r5   )	rN  r\   r  r   rescale_temperaturesum_logprobsr  avg_logprobsr  s	    ``     @r)   r  z-WhisperGenerationMixin._retrieve_avg_logprobs  s5   -83->->kkAV$$''66<?V\!_,,-fl1o-.FFV\!_,../F=&+>">!E!E!G!GRPPPSSTZT`aa tttttt[`aiaopqar[s[sttttt5A5M&L(--b111SYS_`aSb#vz2r+   c
                    |                      |          }
|
dd                                          ddgk    }t          j        |
d d         |
dd          z            d         }|                    d           |	r||         d         ng }t          |          dk    r|                                }g }|r"|                    t          |                      d}|D ]}| ||         }|d                                         |z
  }|d                                         |z
  }|                    ||         ||z  z   ||         ||z  z   |||         d           |	r|||         ||         z   |d         d<   |}|r	||         }n| |dz
                                           |z
  }||z  }n| |
                                	                                         }||         }|
                                dk    r;|d                                         |k    r|d                                         |z
  }||         ||         ||z  z   | ||         dg}|	r|||         z   |d         d<   ||         }||fS )	Nr   FTr   r   r   r   )startendr\   r(   )ger  rh   whereadd_rg   r6   itemnonzeroflattennumel)r   r   r   r   r   r   r   r   r   r   timestamp_tokenssingle_timestamp_endingtimestamp_segment_indicesr   slicesr   
last_slicecurrent_slicesliced_tokensstart_timestamp_posend_timestamp_posr   last_timestamp_posr   s                           r)   r  z(WhisperGenerationMixin._retrieve_segment  s    *7)9)9/)J)J"2233"7">">"@"@UDM"Q$)K0@"0EHXYZY[Y[H\0\$]$]^_$`!!&&q)))D[c<,-?@@ac ())A--.5577FH& 2c-00111J!' + + -j.F G&3A&6&;&;&=&=&O#$1"$5$:$:$<$<$N!!,X!69L~9]!]*847H>7YY"/".s"3	    + (M)AB[QYEZZ RL!34 +

& C!0!:
 &3:>%B%G%G%I%IO%["!3l!B ''7'?'?'A'A'I'I'K'KLJ!0!:!!A%%*R.*=*=*?*??*R*R%/^%8%8%:%:_%L" )2&x03E3VV+*3/	 H ' \3CkRZF[3[/0,X6N''r+   )r|   NN)NNNNNFNNNNNNNNNNNNNr|   NFN)NNNr  )3__name__
__module____qualname__r   r   rh   r   r   r   r   r   r   r   r   r  r   r
  r   r$  r  staticmethodr  r6  r  r7  r  r2  r   r  r   r   r  r  FloatTensorr   r  r  r  r  r  r	  r
  r  r  r  r  r  r  r  r  __classcell__)rG  s   @r)   r{   r{      sW       eio o o of 268<:><@W[!,0"48*.-1/337AE7;-1/3,015 $26 %261q q .q $$45q ##67	q
 $$89q #+8S%,4Gc4R+S"Tq q $D>q smq 5d3i01q "$q U\*q  (}q #+4.q eE5+<$<=>q  &.e_!q" $E?#q$ &e_%q& %SM'q( !.)q* +q, "*$-q. /q0 "*$1q q q qfVi Vi Vi Vi Vip     \ A- A- A-F#, #, #,J*+ *+ *+X
 
 
6 f f \f
 m m \m     \ D ' ' \'$  @ !* !* \!*FUi Ui Uir 7;OS8<"&? ? !23? "%(9?(J"KL? $$45	?
  ? 
? ? ? ?B   \ 	J 	J \	J 
 
 \
8 H H \H$ N N \N     \ &  &  & P : : \:   \* ;) ;) \;)z> > >6 ! ! \!   \$ N( N( \N( N( N( N( N(r+   r{   )rU   rV   NN)1r1  r  r   r  typingr   r   r   r   r   r   r   r1   rh   torch.nn.functionalr	   r#   rl   transformers.cache_utilsr
   
generationr   r   generation.logits_processr   r   r   r   r   generation.stopping_criteriar   modeling_outputsr   utilsr   tokenization_whisperr   r   
get_loggerr"  r  r   r   r*   r   rF   rT   ry   r{   r]   r+   r)   <module>r2     s-       C C C C C C C C C C C C C C C C                     8 8 8 8 8 8 ; ; ; ; ; ; ; ;              A @ @ @ @ @ / / / / / /       < < < < < < < < 
	H	%	%5< s u|    *3&"* 3& 3& 3& 3&l   , , , ,^M( M( M( M( M(_ M( M( M( M( M(r+   