
import warnings
from collections import defaultdict
from typing import TYPE_CHECKING, Dict, Optional, Union

import numpy as np
import requests

from ..tokenization_utils import PreTrainedTokenizer
from ..utils import is_torch_available, is_torchaudio_available, logging
from .audio_utils import ffmpeg_read
from .base import ChunkPipeline


if TYPE_CHECKING:
    from pyctcdecode import BeamSearchDecoderCTC

    from ..feature_extraction_sequence_utils import SequenceFeatureExtractor
    from ..modeling_utils import PreTrainedModel

logger = logging.get_logger(__name__)

if is_torch_available():
    import torch

    from ..models.auto.modeling_auto import MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES


def rescale_stride(stride, ratio):
    """
    Rescales the stride values from audio space to tokens/logits space.

    (160_000, 16_000, 16_000) -> (2000, 200, 200) for instance.
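
    With a model whose logits are 80x shorter than its raw audio input (an illustrative ratio of 1/80), the stride
    (160_000, 16_000, 16_000) maps to (2000, 200, 200) exactly as above: 160_000 * 1/80 = 2000 tokens, and each
    16_000-sample stride scales proportionally to 200 tokens.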
    """
    # Shape is [B, SEQ] for tokens, [B, SEQ, V] for logits
    new_strides = []
    for input_n, left, right in stride:
        token_n = int(round(input_n * ratio))
        left = int(round(left / input_n * token_n))
        right = int(round(right / input_n * token_n))
        new_stride = (token_n, left, right)
        new_strides.append(new_stride)

    return new_strides


def chunk_iter(inputs, feature_extractor, chunk_len, stride_left, stride_right, dtype=None):
    inputs_len = inputs.shape[0]
    step = chunk_len - stride_left - stride_right
    for chunk_start_idx in range(0, inputs_len, step):
        chunk_end_idx = chunk_start_idx + chunk_len
        chunk = inputs[chunk_start_idx:chunk_end_idx]
        processed = feature_extractor(chunk, sampling_rate=feature_extractor.sampling_rate, return_tensors="pt")
        if dtype is not None:
            processed = processed.to(dtype=dtype)
        # The first chunk has no left stride; the last chunk has no right stride.
        _stride_left = 0 if chunk_start_idx == 0 else stride_left
        # all right strides must be full, otherwise it is the last item
        is_last = chunk_end_idx > inputs_len if stride_right > 0 else chunk_end_idx >= inputs_len
        _stride_right = 0 if is_last else stride_right

        chunk_len = chunk.shape[0]
        stride = (chunk_len, _stride_left, _stride_right)
        if chunk.shape[0] > _stride_left:
            yield {"is_last": is_last, "stride": stride, **processed}
        if is_last:
            break
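

# Illustrative walk-through of the chunking arithmetic above (the numbers are examples, not defaults): with 16 kHz
# audio, chunk_len=480_000 samples (30 s) and stride_left=stride_right=80_000 samples (5 s), the step is
# 320_000 samples. A 1_000_000-sample input therefore yields chunks starting at samples 0, 320_000 and 640_000;
# the last chunk runs past the end of the input, is truncated, and is marked `is_last` so iteration stops.
# Only the first chunk gets `_stride_left == 0` and only the last gets `_stride_right == 0`.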


def _fast_find_longest_common_sequence(sequence_left, sequence_right):
    seq_len_left = len(sequence_left)
    seq_len_right = len(sequence_right)
    counter = [[0] * (seq_len_right + 1) for _ in range(seq_len_left + 1)]
    longest = 0
    for i in range(seq_len_left):
        for j in range(seq_len_right):
            if sequence_left[i] == sequence_right[j]:
                previous_counter = counter[i][j] + 1
                counter[i + 1][j + 1] = previous_counter
                if previous_counter > longest:
                    longest = previous_counter

    counter = np.array(counter)
    # we return the index of the first element of the longest common sequence in the left and right sequences
    index_left = np.argwhere(counter == longest)[-1][0] - longest if longest != 0 else -1
    index_right = np.argwhere(counter == longest)[-1][1] - longest if longest != 0 else -1
    return index_left, index_right, longest


def _find_longest_common_sequence(sequences, tokenizer):
    # TODO  Use a faster algorithm this can probably be done in O(n)
    # using suffix array.
    # It might be tedious to do because of fault tolerance.
    # We actually have a really good property which is that the total sequence
    # MUST be those subsequences in order.
    sequence = [tok_id for tok_id in sequences[0][0].tolist() if tok_id not in tokenizer.all_special_ids]
    for new_seq in sequences[1:]:
        new_sequence = [tok_id for tok_id in new_seq[0].tolist() if tok_id not in tokenizer.all_special_ids]

        index = 0
        max_ = 0.0
        for i in range(1, len(new_sequence) + 1):
            # epsilon to favor long perfect matches
            eps = i / 10000.0
            matches = np.sum(np.array(sequence[-i:]) == np.array(new_sequence[:i]))
            matching = matches / i + eps
            if matches > 1 and matching > max_:
                index = i
                max_ = matching
        sequence.extend(new_sequence[index:])
    return np.array(sequence)


class AutomaticSpeechRecognitionPipeline(ChunkPipeline):
    """
    Pipeline that aims at extracting spoken text contained within some audio.

    The input can be either a raw waveform or an audio file. In the case of an audio file, ffmpeg should be installed
    to support multiple audio formats.

    Example:

    ```python
    >>> from transformers import pipeline

    >>> transcriber = pipeline(model="openai/whisper-base")
    >>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac")
    {'text': ' He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered flour-fatten sauce.'}
    ```
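
    To transcribe long recordings, chunking and timestamp output can be enabled. A minimal sketch (the transcription
    and timestamps shown here are illustrative, not actual model outputs):

    ```python
    >>> transcriber = pipeline(model="openai/whisper-base", chunk_length_s=30, return_timestamps=True)
    >>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac")  # doctest: +SKIP
    {'text': ' He hoped there would be stew for dinner...', 'chunks': [{'text': ' He hoped there would be stew for dinner...', 'timestamp': (0.0, 7.0)}]}
    ```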

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    Arguments:
        model ([`PreTrainedModel`] or [`TFPreTrainedModel`]):
            The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
            [`PreTrainedModel`] for PyTorch and [`TFPreTrainedModel`] for TensorFlow.
        feature_extractor ([`SequenceFeatureExtractor`]):
            The feature extractor that will be used by the pipeline to encode waveform for the model.
        tokenizer ([`PreTrainedTokenizer`]):
            The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
            [`PreTrainedTokenizer`].
        decoder (`pyctcdecode.BeamSearchDecoderCTC`, *optional*):
            [PyCTCDecode's
            BeamSearchDecoderCTC](https://github.com/kensho-technologies/pyctcdecode/blob/2fd33dc37c4111417e08d89ccd23d28e9b308d19/pyctcdecode/decoder.py#L180)
            can be passed for language model boosted decoding. See [`Wav2Vec2ProcessorWithLM`] for more information.
        chunk_length_s (`float`, *optional*, defaults to 0):
            The input length for each chunk. If `chunk_length_s = 0`, chunking is disabled (the default).

            <Tip>

            For more information on how to effectively use `chunk_length_s`, please have a look at the [ASR chunking
            blog post](https://huggingface.co/blog/asr-chunking).

            </Tip>

        stride_length_s (`float`, *optional*, defaults to `chunk_length_s / 6`):
            The length of stride on the left and right of each chunk. Used only with `chunk_length_s > 0`. This enables
            the model to *see* more context and infer letters better than it could without this context, but the
            pipeline discards the stride bits at the end to make the final reconstitution as accurate as possible.

            <Tip>

            For more information on how to effectively use `stride_length_s`, please have a look at the [ASR chunking
            blog post](https://huggingface.co/blog/asr-chunking).

            </Tip>

        framework (`str`, *optional*):
            The framework to use, either `"pt"` for PyTorch or `"tf"` for TensorFlow. The specified framework must be
            installed. If no framework is specified, will default to the one currently installed. If no framework is
            specified and both frameworks are installed, will default to the framework of the `model`, or to PyTorch if
            no model is provided.
        device (Union[`int`, `torch.device`], *optional*):
            Device ordinal for CPU/GPU support. Setting this to `None` will leverage CPU; a positive integer will run
            the model on the associated CUDA device id.
        torch_dtype (Union[`str`, `torch.dtype`], *optional*):
            The data-type (dtype) of the computation. Setting this to `None` will use float32 precision. Set to
            `torch.float16` or `torch.bfloat16` to use half-precision in the respective dtypes.
    """

    def __init__(
        self,
        model: "PreTrainedModel",
        feature_extractor: Union["SequenceFeatureExtractor", str] = None,
        tokenizer: Optional[PreTrainedTokenizer] = None,
        decoder: Optional[Union["BeamSearchDecoderCTC", str]] = None,
        device: Union[int, "torch.device"] = None,
        torch_dtype: Optional[Union[str, "torch.dtype"]] = None,
        **kwargs,
    ):
        # set the model type so we can check we have the right pre- and post-processing parameters
        if model.config.model_type == "whisper":
            self.type = "seq2seq_whisper"
        elif model.__class__.__name__ in MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES.values():
            self.type = "seq2seq"
        elif (
            feature_extractor._processor_class
            and feature_extractor._processor_class.endswith("WithLM")
            and decoder is not None
        ):
            self.decoder = decoder
            self.type = "ctc_with_lm"
        else:
            self.type = "ctc"

        super().__init__(model, feature_extractor, tokenizer, device=device, torch_dtype=torch_dtype, **kwargs)

    def __call__(
        self,
        inputs: Union[np.ndarray, bytes, str],
        **kwargs,
    ):
        """
        Transcribe the audio sequence(s) given as inputs to text. See the [`AutomaticSpeechRecognitionPipeline`]
        documentation for more information.

        Args:
            inputs (`np.ndarray` or `bytes` or `str` or `dict`):
                The input is either:
                    - `str` that is either the filename of a local audio file, or a public URL address to download the
                      audio file. The file will be read at the correct sampling rate to get the waveform using
                      *ffmpeg*. This requires *ffmpeg* to be installed on the system.
                    - `bytes` it is supposed to be the content of an audio file and is interpreted by *ffmpeg* in the
                      same way.
                    - (`np.ndarray` of shape (n, ) of type `np.float32` or `np.float64`)
                        Raw audio at the correct sampling rate (no further check will be done)
                    - `dict` form can be used to pass raw audio sampled at arbitrary `sampling_rate` and let this
                      pipeline do the resampling. The dict must be in the format `{"sampling_rate": int, "raw":
                      np.array}` with optionally a `"stride": (left: int, right: int)` that can ask the pipeline to
                      treat the first `left` samples and last `right` samples to be ignored in decoding (but used at
                      inference to provide more context to the model). Only use `stride` with CTC models.
            return_timestamps (*optional*, `str` or `bool`):
                Only available for pure CTC models (Wav2Vec2, HuBERT, etc) and the Whisper model. Not available for
                other sequence-to-sequence models.

                For CTC models, timestamps can take one of two formats:
                    - `"char"`: the pipeline will return timestamps along the text for every character in the text. For
                        instance, if you get `[{"text": "h", "timestamp": (0.5, 0.6)}, {"text": "i", "timestamp": (0.7,
                        0.9)}]`, then it means the model predicts that the letter "h" was spoken after `0.5` and before
                        `0.6` seconds.
                    - `"word"`: the pipeline will return timestamps along the text for every word in the text. For
                        instance, if you get `[{"text": "hi ", "timestamp": (0.5, 0.9)}, {"text": "there", "timestamp":
                        (1.0, 1.5)}]`, then it means the model predicts that the word "hi" was spoken after `0.5` and
                        before `0.9` seconds.

                For the Whisper model, timestamps can take one of two formats:
                    - `"word"`: same as above for word-level CTC timestamps. Word-level timestamps are predicted
                        through the *dynamic-time warping (DTW)* algorithm, an approximation to word-level timestamps
                        by inspecting the cross-attention weights.
                    - `True`: the pipeline will return timestamps along the text for *segments* of words in the text.
                        For instance, if you get `[{"text": " Hi there!", "timestamp": (0.5, 1.5)}]`, then it means the
                        model predicts that the segment "Hi there!" was spoken after `0.5` and before `1.5` seconds.
                        Note that a segment of text refers to a sequence of one or more words, rather than individual
                        words as with word-level timestamps.
            generate_kwargs (`dict`, *optional*):
                The dictionary of ad-hoc parametrization of `generate_config` to be used for the generation call. For a
                complete overview of generate, check the [following
                guide](https://huggingface.co/docs/transformers/en/main_classes/text_generation).

        Return:
            `Dict`: A dictionary with the following keys:
                - **text** (`str`): The recognized text.
                - **chunks** (*optional*, `List[Dict]`)
                    When using `return_timestamps`, the `chunks` will become a list containing all the various text
                    chunks identified by the model, *e.g.* `[{"text": "hi ", "timestamp": (0.5, 0.9)}, {"text":
                    "there", "timestamp": (1.0, 1.5)}]`. The original full text can roughly be recovered by doing
                    `"".join(chunk["text"] for chunk in output["chunks"])`.
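
        Example of passing raw audio together with its sampling rate, reusing the `transcriber` from the class-level
        example above (a minimal sketch; the array here is illustrative silence, and the pipeline resamples it to the
        feature extractor's rate before transcribing):

        ```python
        >>> import numpy as np

        >>> raw = np.zeros((8000,), dtype=np.float32)  # one second of silence sampled at 8 kHz
        >>> transcriber({"sampling_rate": 8000, "raw": raw})  # doctest: +SKIP
        ```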
        """
        return super().__call__(inputs, **kwargs)

    def _sanitize_parameters(
        self,
        chunk_length_s=None,
        stride_length_s=None,
        ignore_warning=None,
        decoder_kwargs=None,
        return_timestamps=None,
        return_language=None,
        generate_kwargs=None,
        max_new_tokens=None,
    ):
        # No parameters on this pipeline right now
        preprocess_params = {}
        if chunk_length_s is not None:
            if self.type == "seq2seq" and not ignore_warning:
                logger.warning(
                    "Using `chunk_length_s` is very experimental with seq2seq models. The results will not necessarily"
                    " be entirely accurate and will have caveats. More information:"
                    " https://github.com/huggingface/transformers/pull/20104. Ignore this warning with pipeline(...,"
                    " ignore_warning=True)"
                )
            preprocess_params["chunk_length_s"] = chunk_length_s
        if stride_length_s is not None:
            preprocess_params["stride_length_s"] = stride_length_s

        forward_params = defaultdict(dict)
        if max_new_tokens is not None:
            warnings.warn(
                "`max_new_tokens` is deprecated and will be removed in version 4.49 of Transformers. To remove this"
                " warning, pass `max_new_tokens` as a key inside `generate_kwargs` instead.",
                FutureWarning,
            )
            forward_params["max_new_tokens"] = max_new_tokens
        if generate_kwargs is not None:
            if max_new_tokens is not None and "max_new_tokens" in generate_kwargs:
                raise ValueError(
                    "`max_new_tokens` is defined both as an argument and inside `generate_kwargs` argument, please use"
                    " only 1 version"
                )
            forward_params.update(generate_kwargs)

        postprocess_params = {}
        if decoder_kwargs is not None:
            postprocess_params["decoder_kwargs"] = decoder_kwargs
        if return_timestamps is not None:
            # Check whether we have a valid setting for return_timestamps and raise before the forward pass
            if self.type == "seq2seq" and return_timestamps:
                raise ValueError("We cannot return_timestamps yet on non-CTC models apart from Whisper!")
            if self.type == "ctc_with_lm" and return_timestamps != "word":
                raise ValueError("CTC with LM can only predict word level timestamps, set `return_timestamps='word'`")
            if self.type == "ctc" and return_timestamps not in ["char", "word"]:
                raise ValueError(
                    "CTC can either predict character level timestamps, or word level timestamps. Set"
                    " `return_timestamps='char'` or `return_timestamps='word'` as required."
                )
            if self.type == "seq2seq_whisper" and return_timestamps == "char":
                raise ValueError(
                    "Whisper cannot return `char` timestamps, only word level or segment level timestamps. Use"
                    " `return_timestamps='word'` or `return_timestamps=True` respectively."
                )
            forward_params["return_timestamps"] = return_timestamps
            postprocess_params["return_timestamps"] = return_timestamps
        if return_language is not None:
            if self.type != "seq2seq_whisper":
                raise ValueError("Only Whisper can return language for now.")
            postprocess_params["return_language"] = return_language

        return preprocess_params, forward_params, postprocess_params
    def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None):
        if isinstance(inputs, str):
            if inputs.startswith("http://") or inputs.startswith("https://"):
                # We need to actually check for a real protocol, otherwise it's impossible to use a local file
                inputs = requests.get(inputs).content
            else:
                with open(inputs, "rb") as f:
                    inputs = f.read()

        if isinstance(inputs, bytes):
            inputs = ffmpeg_read(inputs, self.feature_extractor.sampling_rate)

        stride = None
        extra = {}
        if isinstance(inputs, dict):
            stride = inputs.pop("stride", None)
            # Accepting `"array"` which is the key defined in `datasets` for better integration
            if not ("sampling_rate" in inputs and ("raw" in inputs or "array" in inputs)):
                raise ValueError(
                    "When passing a dictionary to AutomaticSpeechRecognitionPipeline, the dict needs to contain a "
                    '"raw" key containing the numpy array representing the audio and a "sampling_rate" key, '
                    "containing the sampling_rate associated with that array"
                )

            _inputs = inputs.pop("raw", None)
            if _inputs is None:
                # Remove path which will not be used from `datasets`.
                inputs.pop("path", None)
                _inputs = inputs.pop("array", None)
            in_sampling_rate = inputs.pop("sampling_rate")
            extra = inputs
            inputs = _inputs
            if in_sampling_rate != self.feature_extractor.sampling_rate:
                if is_torchaudio_available():
                    from torchaudio import functional as F
                else:
                    raise ImportError(
                        "torchaudio is required to resample audio samples in AutomaticSpeechRecognitionPipeline. "
                        "The torchaudio package can be installed through: `pip install torchaudio`."
                    )

                inputs = F.resample(
                    torch.from_numpy(inputs), in_sampling_rate, self.feature_extractor.sampling_rate
                ).numpy()
                ratio = self.feature_extractor.sampling_rate / in_sampling_rate
            else:
                ratio = 1
            if stride is not None:
                if stride[0] + stride[1] > inputs.shape[0]:
                    raise ValueError("Stride is too large for input")

                # Stride needs to get the chunk length here, it's going to get
                # swallowed by the `feature_extractor` later, and then batching
                # can add extra data in the inputs, so we need to keep track
                # of the original length in the stride so we can cut properly.
                stride = (inputs.shape[0], int(round(stride[0] * ratio)), int(round(stride[1] * ratio)))

        if not isinstance(inputs, np.ndarray):
            raise TypeError(f"We expect a numpy ndarray as input, got `{type(inputs)}`")
        if len(inputs.shape) != 1:
            raise ValueError("We expect a single channel audio input for AutomaticSpeechRecognitionPipeline")

        if chunk_length_s:
            if stride_length_s is None:
                stride_length_s = chunk_length_s / 6

            if isinstance(stride_length_s, (int, float)):
                stride_length_s = [stride_length_s, stride_length_s]

            # XXX: Careful, this variable will not exist in `seq2seq` setting.
            # Currently chunking is not possible at this level for `seq2seq` so it's ok.
            align_to = getattr(self.model.config, "inputs_to_logits_ratio", 1)
            chunk_len = int(round(chunk_length_s * self.feature_extractor.sampling_rate / align_to) * align_to)
            stride_left = int(round(stride_length_s[0] * self.feature_extractor.sampling_rate / align_to) * align_to)
            stride_right = int(round(stride_length_s[1] * self.feature_extractor.sampling_rate / align_to) * align_to)

            if chunk_len < stride_left + stride_right:
                raise ValueError("Chunk length must be superior to stride length")

            for item in chunk_iter(
                inputs, self.feature_extractor, chunk_len, stride_left, stride_right, self.torch_dtype
            ):
                yield item
        else:
            if self.type == "seq2seq_whisper" and inputs.shape[0] > self.feature_extractor.n_samples:
                processed = self.feature_extractor(
                    inputs,
                    sampling_rate=self.feature_extractor.sampling_rate,
                    truncation=False,
                    padding="longest",
                    return_tensors="pt",
                    return_attention_mask=True,
                )
            else:
                if self.type == "seq2seq_whisper" and stride is None:
                    processed = self.feature_extractor(
                        inputs,
                        sampling_rate=self.feature_extractor.sampling_rate,
                        return_tensors="pt",
                        return_token_timestamps=True,
                    )
                    extra["num_frames"] = processed.pop("num_frames")
                else:
                    processed = self.feature_extractor(
                        inputs, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt"
                    )

            if self.torch_dtype is not None:
                processed = processed.to(dtype=self.torch_dtype)
            if stride is not None:
                if self.type == "seq2seq":
                    raise ValueError("Stride is only usable with CTC models, try removing it !")

                processed["stride"] = stride
            yield {"is_last": True, **processed, **extra}

    def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs):
        attention_mask = model_inputs.pop("attention_mask", None)
        stride = model_inputs.pop("stride", None)
        num_frames = model_inputs.pop("num_frames", None)
        is_last = model_inputs.pop("is_last")

        if stride is not None and num_frames is not None:
            raise ValueError("num_frames must be used only when stride is None")

        if self.type in {"seq2seq", "seq2seq_whisper"}:
            # Consume values so we can let extra information flow freely through the pipeline
            if "input_features" in model_inputs:
                inputs = model_inputs.pop("input_features")
            elif "input_values" in model_inputs:
                inputs = model_inputs.pop("input_values")
            else:
                raise ValueError(
                    "Seq2Seq speech recognition model requires either a "
                    f"`input_features` or `input_values` key, but only has {model_inputs.keys()}"
                )

            # custom processing for Whisper timestamps and word-level timestamps
            if return_timestamps and self.type == "seq2seq_whisper":
                generate_kwargs["return_timestamps"] = return_timestamps
                if return_timestamps == "word":
                    generate_kwargs["return_token_timestamps"] = True
                    generate_kwargs["return_segments"] = True

                    if stride is not None:
                        if isinstance(stride, tuple):
                            generate_kwargs["num_frames"] = stride[0] // self.feature_extractor.hop_length
                        else:
                            generate_kwargs["num_frames"] = [s[0] // self.feature_extractor.hop_length for s in stride]
                    else:
                        generate_kwargs["num_frames"] = num_frames

            # User-defined `generation_config` passed to the pipeline call take precedence
            if "generation_config" not in generate_kwargs:
                generate_kwargs["generation_config"] = self.generation_config

            tokens = self.model.generate(
                inputs=inputs,
                attention_mask=attention_mask,
                **generate_kwargs,
            )
            # whisper longform generation stores timestamps in "segments"
            if return_timestamps == "word" and self.type == "seq2seq_whisper":
                if "segments" not in tokens:
                    out = {"tokens": tokens["sequences"], "token_timestamps": tokens["token_timestamps"]}
                else:
                    token_timestamps = [
                        torch.cat([segment["token_timestamps"] for segment in segment_list])
                        for segment_list in tokens["segments"]
                    ]
                    out = {"tokens": tokens["sequences"], "token_timestamps": token_timestamps}
            else:
                out = {"tokens": tokens}
            if self.type == "seq2seq_whisper":
                if stride is not None:
                    out["stride"] = stride

        else:
            inputs = {
                self.model.main_input_name: model_inputs.pop(self.model.main_input_name),
                "attention_mask": attention_mask,
            }
            outputs = self.model(**inputs)
            logits = outputs.logits

            if self.type == "ctc_with_lm":
                out = {"logits": logits}
            else:
                out = {"tokens": logits.argmax(dim=-1)}
            if stride is not None:
                # Send stride to `postprocess`.
                # it needs to be handled there where
                # the pieces are to be concatenated.
                ratio = 1 / self.model.config.inputs_to_logits_ratio
                if isinstance(stride, tuple):
                    out["stride"] = rescale_stride([stride], ratio)[0]
                else:
                    out["stride"] = rescale_stride(stride, ratio)
        # Leftover
        extra = model_inputs
        return {"is_last": is_last, **out, **extra}

    def postprocess(
        self, model_outputs, decoder_kwargs: Optional[Dict] = None, return_timestamps=None, return_language=None
    ):
        # Optional return types
        optional = {}

        final_items = []
        key = "logits" if self.type == "ctc_with_lm" else "tokens"
        stride = None
        for outputs in model_outputs:
            if self.framework == "pt" and outputs[key].dtype in (torch.bfloat16, torch.float16):
                items = outputs[key].to(torch.float32).numpy()
            else:
                items = outputs[key].numpy()
            stride = outputs.get("stride", None)
            if stride is not None and self.type in {"ctc", "ctc_with_lm"}:
                total_n, left, right = stride
                # Total_n might be < logits.shape[1] because of padding, that's why
                # we need to reconstruct this information. This won't work with left padding
                # (which doesn't exist right now)
                right_n = total_n - right
                items = items[:, left:right_n]
            final_items.append(items)

        if stride and self.type == "seq2seq":
            items = _find_longest_common_sequence(final_items, self.tokenizer)
        elif self.type == "seq2seq_whisper":
            time_precision = self.feature_extractor.chunk_length / self.model.config.max_source_positions
            # Send the chunking back to seconds, it's easier to handle in whisper
            sampling_rate = self.feature_extractor.sampling_rate
            for output in model_outputs:
                if "stride" in output:
                    chunk_len, stride_left, stride_right = output["stride"]
                    # Go back in seconds
                    chunk_len /= sampling_rate
                    stride_left /= sampling_rate
                    stride_right /= sampling_rate
                    output["stride"] = chunk_len, stride_left, stride_right

            text, optional = self.tokenizer._decode_asr(
                model_outputs,
                return_timestamps=return_timestamps,
                return_language=return_language,
                time_precision=time_precision,
            )
        else:
            items = np.concatenate(final_items, axis=1)
            items = items.squeeze(0)

        if self.type == "ctc_with_lm":
            if decoder_kwargs is None:
                decoder_kwargs = {}
            beams = self.decoder.decode_beams(items, **decoder_kwargs)
            text = beams[0][0]
            if return_timestamps:
                # Simply cast from pyctcdecode format to wav2vec2 format to leverage
                # pre-existing code later
                chunk_offset = beams[0][2]
                offsets = []
                for word, (start_offset, end_offset) in chunk_offset:
                    offsets.append({"word": word, "start_offset": start_offset, "end_offset": end_offset})
        elif self.type != "seq2seq_whisper":
            skip_special_tokens = self.type != "ctc"
            text = self.tokenizer.decode(items, skip_special_tokens=skip_special_tokens)
            if return_timestamps:
                offsets = self.tokenizer.decode(
                    items, skip_special_tokens=skip_special_tokens, output_char_offsets=True
                )["char_offsets"]
                if return_timestamps == "word":
                    offsets = self.tokenizer._get_word_offsets(offsets, self.tokenizer.replace_word_delimiter_char)

        if return_timestamps and self.type not in {"seq2seq", "seq2seq_whisper"}:
            chunks = []
            for item in offsets:
                start = item["start_offset"] * self.model.config.inputs_to_logits_ratio
                start /= self.feature_extractor.sampling_rate

                stop = item["end_offset"] * self.model.config.inputs_to_logits_ratio
                stop /= self.feature_extractor.sampling_rate

                chunks.append({"text": item[return_timestamps], "timestamp": (start, stop)})
            optional["chunks"] = chunks

        extra = defaultdict(list)
        for output in model_outputs:
            output.pop("tokens", None)
            output.pop("logits", None)
            output.pop("is_last", None)
            output.pop("stride", None)
            output.pop("token_timestamps", None)
            for k, v in output.items():
                extra[k].append(v)
        return {"text": text, **optional, **extra}


def _find_timestamp_sequence(sequences, tokenizer, feature_extractor, max_source_positions):
    """
    Computes the final sequences by merging the end of the nth sequence with the beginning of the n+1th sequence. Since
    `WhisperForConditionalGeneration` produces the timestamps pairwise, we filter the consecutive timestamps and only
    iterate over them. We keep track of the `time` which indicates the actual starting time of the chunk that is
    processed. We need to make sure to offset the timestamps tokens by the `time` in order for the tokenizer to
    properly compute the final `offset`.
    """
    # index of the first timestamp token
    timestamp_begin = tokenizer.convert_tokens_to_ids("<|notimestamps|>") + 1
    items = []
    # approximation of the token to time ratio : ~0.2 seconds
    time_precision = feature_extractor.chunk_length / max_source_positions
    time = 0
    for seq_idx, item in enumerate(sequences):
        sequence, stride = item
        if isinstance(sequence, list):
            sequence = np.array(sequence)
        chunk_len, stride_left, stride_right = stride
        sequence = sequence.squeeze(0)
        # get rid of the `forced_decoder_ids` that are used to parametrize the generation
        begin_idx = np.where(sequence == timestamp_begin)[0][0] if timestamp_begin in sequence else 0
        sequence = sequence[begin_idx:]

        timestamp_tokens = sequence >= timestamp_begin
        if seq_idx != 0 and sum(timestamp_tokens) > 0:
            consecutive = np.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0] + 1
            last_timestamp = np.where(timestamp_tokens)[0][-1]
            consecutive = np.append(consecutive, last_timestamp) if last_timestamp not in consecutive else consecutive
            time -= stride_left + stride_right
            offset = int((time / feature_extractor.sampling_rate) / time_precision)
            overlap_time = int((stride_left / feature_extractor.sampling_rate) / time_precision)
            # relevant timestamps are in the overlapping part of the last sequence
            relevant_timestamp = np.where(sequence[consecutive] >= timestamp_begin + overlap_time)[0]
            if relevant_timestamp.shape[0] > 0:
                relevant_timestamp = (
                    consecutive[relevant_timestamp[0] - 1] if relevant_timestamp[0] > 0 else consecutive[0]
                )
                # if a big stride is used, we need to check some of the previous items for the best overlap
                best_match = 0
                sliced_sequence = []
                for idx, previous_sequence in enumerate(reversed(items)):
                    previous_tokens = previous_sequence[1:-1]
                    if previous_sequence[0] < (timestamp_begin + offset - overlap_time) and idx != 0:
                        break  # the previous sequence is too far in the past
                    if len(previous_tokens) > 0:
                        # find the longest common sequence between the overlapping parts
                        index_left, index_right, match_length = _fast_find_longest_common_sequence(
                            sequence[1:relevant_timestamp], previous_tokens
                        )
                        # don't do anything if only 1 token was matched
                        if match_length > 1 and match_length > best_match:
                            best_match = match_length
                            best_idx = idx
                            end_of_curr_sequence_idx = (
                                np.where(sequence[index_left + 1 :] >= timestamp_begin)[0][0] + 1
                            )
                            end_of_curr_sequence_idx = end_of_curr_sequence_idx + 1 + index_left
                            # if all the tokens are matched, suffix
                            if index_left == 0 and match_length == len(previous_tokens):
                                sliced_sequence = np.insert(
                                    sequence[index_left + 1 : end_of_curr_sequence_idx], 0, previous_sequence[0]
                                )
                                sliced_sequence[-1] = previous_sequence[-1]
                            # if part of the previous sequence is not taken
                            elif index_left >= 0:
                                sliced_sequence = sequence[index_left + 1 : end_of_curr_sequence_idx]
                                # let's insert the missing part of the previous sequence
                                previous_slice = (
                                    previous_sequence[: index_right + 1] if index_right > 0 else [previous_sequence[0]]
                                )
                                sliced_sequence = np.insert(sliced_sequence, 0, previous_slice)
                                sliced_sequence[-1] += offset

                if len(sliced_sequence) > 0:
                    items[len(items) - best_idx - 1] = sliced_sequence
                    items = items[: len(items) - best_idx]
                    sequence = sequence[end_of_curr_sequence_idx:]

        # if there is a consecutive timestamp, it means the output is segmented
        timestamp_tokens = sequence >= timestamp_begin
        consecutive = np.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0] + 1
        if sum(timestamp_tokens) > 0:
            last_timestamp = np.where(timestamp_tokens)[0][-1]
            consecutive = (
                np.append(consecutive, last_timestamp + 1) if last_timestamp not in consecutive else consecutive
            )

        if len(consecutive) > 0:
            last_slice = 0
            for current_slice in consecutive:
                actual_offset = items[-1][-1] if seq_idx != 0 or last_slice != 0 else sequence[0]
                sliced_tokens = sequence[last_slice:current_slice]
                duration = sliced_tokens[-1] - sliced_tokens[0]
                sliced_tokens[0] = actual_offset
                sliced_tokens[-1] = actual_offset + duration
                items.append(sliced_tokens)
                last_slice = current_slice

        time += chunk_len
    result = []
    for i in range(len(items)):
        result += items[i].tolist()
    return result