"""PyTorch SpeechT5 model."""

import math
from typing import List, Optional, Tuple, Union

import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, L1Loss

from ...activations import ACT2FN
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...integrations.fsdp import is_fsdp_managed_module
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
    Seq2SeqSpectrogramOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_speecht5 import SpeechT5Config, SpeechT5HifiGanConfig


logger = logging.get_logger(__name__)

_HIDDEN_STATES_START_POSITION = 1

# General docstring
_CONFIG_FOR_DOC = "SpeechT5Config"


def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Shift input ids one token to the right.
    """
    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
    shifted_input_ids[:, 0] = decoder_start_token_id

    if pad_token_id is None:
        raise ValueError("self.model.config.pad_token_id has to be defined.")
    # replace possible -100 values in labels by `pad_token_id`
    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

    return shifted_input_ids


def shift_spectrograms_right(
    input_values: torch.Tensor, reduction_factor: int = 1, attention_mask: Optional[torch.Tensor] = None
):
    """
    Shift input spectrograms one timestep to the right. Also applies the reduction factor to the sequence length.
    """
    # thin out frames for reduction factor
    if reduction_factor > 1:
        input_values = input_values[:, reduction_factor - 1 :: reduction_factor]
        if attention_mask is not None:
            attention_mask = attention_mask[:, reduction_factor - 1 :: reduction_factor]

    shifted_input_values = input_values.new_zeros(input_values.shape)
    shifted_input_values[:, 1:] = input_values[:, :-1].clone()

    # replace possible -100 values in labels by zeros
    shifted_input_values.masked_fill_(shifted_input_values == -100.0, 0.0)

    return shifted_input_values, attention_mask


def _compute_mask_indices(
    shape: Tuple[int, int],
    mask_prob: float,
    mask_length: int,
    attention_mask: Optional[torch.LongTensor] = None,
    min_masks: int = 0,
) -> np.ndarray:
    """
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    """
    batch_size, sequence_length = shape

    if mask_length < 1:
        raise ValueError("`mask_length` has to be bigger than 0.")

    if mask_length > sequence_length:
        raise ValueError(
            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
            f" and `sequence_length`: {sequence_length}`"
        )

    # epsilon is used for probabilistic rounding
    epsilon = np.random.rand(1).item()

    def compute_num_masked_span(input_length):
        """Given input length, compute how many spans should be masked"""
        num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
        num_masked_span = max(num_masked_span, min_masks)

        # make sure num masked span <= sequence_length
        if num_masked_span * mask_length > sequence_length:
            num_masked_span = sequence_length // mask_length

        # make sure num_masked span is also <= input_length - (mask_length - 1)
        if input_length - (mask_length - 1) < num_masked_span:
            num_masked_span = max(input_length - (mask_length - 1), 0)

        return num_masked_span

    # compute number of masked spans in batch
    input_lengths = (
        attention_mask.sum(-1).detach().tolist()
        if attention_mask is not None
        else [sequence_length for _ in range(batch_size)]
    )

    # SpecAugment mask to fill
    spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
    spec_aug_mask_idxs = []

    max_num_masked_span = compute_num_masked_span(sequence_length)

    if max_num_masked_span == 0:
        return spec_aug_mask

    for input_length in input_lengths:
        # compute num of masked spans for this input
        num_masked_span = compute_num_masked_span(input_length)

        # get random indices to mask
        spec_aug_mask_idx = np.random.choice(
            np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
        )

        # pick first sampled index that will serve as a dummy index to pad vector
        # to ensure same dimension for all batches due to probabilistic rounding
        if len(spec_aug_mask_idx) == 0:
            # this case can only happen if `input_length` is strictly smaller than
            # `sequence_length`, in which case the last token has to be a padding
            # token which we can use as a dummy mask id
            dummy_mask_idx = sequence_length - 1
        else:
            dummy_mask_idx = spec_aug_mask_idx[0]

        spec_aug_mask_idx = np.concatenate(
            [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
        )
        spec_aug_mask_idxs.append(spec_aug_mask_idx)

    spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)

    # expand masked indices to masked spans
    spec_aug_mask_idxs = np.broadcast_to(
        spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
    )
    spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)

    # add offset to the starting indexes so that indexes now create a span
    offsets = np.arange(mask_length)[None, None, :]
    offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
        batch_size, max_num_masked_span * mask_length
    )
    spec_aug_mask_idxs = spec_aug_mask_idxs + offsets

    # ensure that we cannot have indices larger than sequence_length
    if spec_aug_mask_idxs.max() > sequence_length - 1:
        spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1

    # scatter indices to mask
    np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)

    return spec_aug_mask
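
# Illustrative call (hypothetical numbers): for a (2, 100) feature axis with
# mask_prob=0.05 and mask_length=10, roughly 100 * 0.05 / 10, i.e. zero or one span of
# 10 steps, is masked per row, returned as a boolean numpy array of shape (2, 100).
#
#   mask = _compute_mask_indices((2, 100), mask_prob=0.05, mask_length=10)
#   mask.shape  # (2, 100)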


class SpeechT5NoLayerNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        self.conv = nn.Conv1d(
            self.in_conv_dim,
            self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            stride=config.conv_stride[layer_id],
            bias=config.conv_bias,
        )
        self.activation = ACT2FN[config.feat_extract_activation]

    def forward(self, hidden_states):
        hidden_states = self.conv(hidden_states)
        hidden_states = self.activation(hidden_states)
        return hidden_states


class SpeechT5LayerNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        self.conv = nn.Conv1d(
            self.in_conv_dim,
            self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            stride=config.conv_stride[layer_id],
            bias=config.conv_bias,
        )
        self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
        self.activation = ACT2FN[config.feat_extract_activation]

    def forward(self, hidden_states):
        hidden_states = self.conv(hidden_states)

        hidden_states = hidden_states.transpose(-2, -1)
        hidden_states = self.layer_norm(hidden_states)
        hidden_states = hidden_states.transpose(-2, -1)

        hidden_states = self.activation(hidden_states)
        return hidden_states


class SpeechT5GroupNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        self.conv = nn.Conv1d(
            self.in_conv_dim,
            self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            stride=config.conv_stride[layer_id],
            bias=config.conv_bias,
        )
        self.activation = ACT2FN[config.feat_extract_activation]

        self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True)

    def forward(self, hidden_states):
        hidden_states = self.conv(hidden_states)
        hidden_states = self.layer_norm(hidden_states)
        hidden_states = self.activation(hidden_states)
        return hidden_states


class SpeechT5SinusoidalPositionalEmbedding(nn.Module):
    """This module produces sinusoidal positional embeddings of any length."""

    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
        super().__init__()
        self.offset = 2
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)

    def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
        emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
        if hasattr(self, "weights"):
            # in forward, put the weights on the correct dtype and device of the parameter
            emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)

        self.weights = nn.Parameter(emb_weights)
        self.weights.requires_grad = False
        self.weights.detach_()
    @staticmethod
    def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
        """
        Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
        description in Section 3.5 of "Attention Is All You Need".
        """
        half_dim = embedding_dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
        emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
        if embedding_dim % 2 == 1:
            # zero pad
            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
        if padding_idx is not None:
            emb[padding_idx, :] = 0

        return emb.to(torch.get_default_dtype())

    @torch.no_grad()
    def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
        bsz, seq_len = input_ids.size()
        # Create the position ids from the input token ids. Any padded tokens remain padded.
        position_ids = self.create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
            input_ids.device
        )

        # expand embeddings if needed
        max_pos = self.padding_idx + 1 + seq_len
        if max_pos > self.weights.size(0):
            self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)

        return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, -1).detach()
    def create_position_ids_from_input_ids(
        self, input_ids: torch.Tensor, padding_idx: int, past_key_values_length: Optional[int] = 0
    ):
        """
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
        symbols are ignored. This is modified from fairseq's `utils.make_positions`.

        Args:
            x: torch.Tensor x:
        Returns: torch.Tensor
        """
        # The series of casts and type-conversions here are carefully balanced to work with both ONNX export and XLA.
        mask = input_ids.ne(padding_idx).int()
        incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
        return incremental_indices.long() + padding_idx
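
# Illustrative behavior (hypothetical values): with padding_idx=1, padded positions keep
# index 1 and real tokens count up from 2, so right-padded batches stay aligned.
#
#   ids = torch.tensor([[7, 8, 9, 1]])
#   mask = ids.ne(1).int()                  # tensor([[1, 1, 1, 0]])
#   (torch.cumsum(mask, dim=1) * mask) + 1  # tensor([[2, 3, 4, 1]])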


class SpeechT5PositionalConvEmbedding(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.conv = nn.Conv1d(
            config.hidden_size,
            config.hidden_size,
            kernel_size=config.num_conv_pos_embeddings,
            padding=config.num_conv_pos_embeddings // 2,
            groups=config.num_conv_pos_embedding_groups,
        )

        weight_norm = nn.utils.weight_norm
        if hasattr(nn.utils.parametrizations, "weight_norm"):
            weight_norm = nn.utils.parametrizations.weight_norm

        if is_deepspeed_zero3_enabled():
            import deepspeed

            with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
                self.conv = weight_norm(self.conv, name="weight", dim=2)
            if hasattr(self.conv, "parametrizations"):
                weight_g = self.conv.parametrizations.weight.original0
                weight_v = self.conv.parametrizations.weight.original1
            else:
                weight_g = self.conv.weight_g
                weight_v = self.conv.weight_v
            deepspeed.zero.register_external_parameter(self, weight_v)
            deepspeed.zero.register_external_parameter(self, weight_g)
        else:
            self.conv = weight_norm(self.conv, name="weight", dim=2)

        self.padding = SpeechT5SamePadLayer(config.num_conv_pos_embeddings)
        self.activation = ACT2FN[config.feat_extract_activation]

    def forward(self, hidden_states):
        hidden_states = hidden_states.transpose(1, 2)

        hidden_states = self.conv(hidden_states)
        hidden_states = self.padding(hidden_states)
        hidden_states = self.activation(hidden_states)

        hidden_states = hidden_states.transpose(1, 2)
        return hidden_states

class SpeechT5ScaledPositionalEncoding(nn.Module):
    """
    Scaled positional encoding, see §3.2 in https://arxiv.org/abs/1809.08895
    """

    def __init__(self, dropout, dim, max_len=5000):
        pe = torch.zeros(max_len, dim)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, dim, 2, dtype=torch.int64).float() * -(math.log(10000.0) / dim))
        pe[:, 0::2] = torch.sin(position.float() * div_term)
        pe[:, 1::2] = torch.cos(position.float() * div_term)
        pe = pe.unsqueeze(0)
        super().__init__()
        self.register_buffer("pe", pe, persistent=False)
        self.dropout = nn.Dropout(p=dropout)
        self.dim = dim
        self.alpha = torch.nn.Parameter(torch.tensor(1.0))

    def forward(self, emb):
        emb = emb + self.alpha * self.pe[:, : emb.size(1)]
        emb = self.dropout(emb)
        return emb


class SpeechT5RelativePositionalEncoding(torch.nn.Module):
    def __init__(self, dim, max_length=1000):
        super().__init__()
        self.dim = dim
        self.max_length = max_length
        self.pe_k = torch.nn.Embedding(2 * max_length, dim)

    def forward(self, hidden_states):
        seq_len = hidden_states.shape[1]
        pos_seq = torch.arange(0, seq_len).long().to(hidden_states.device)
        pos_seq = pos_seq[:, None] - pos_seq[None, :]

        pos_seq[pos_seq < -self.max_length] = -self.max_length
        pos_seq[pos_seq >= self.max_length] = self.max_length - 1
        pos_seq = pos_seq + self.max_length

        return self.pe_k(pos_seq)


class SpeechT5SamePadLayer(nn.Module):
    def __init__(self, num_conv_pos_embeddings):
        super().__init__()
        self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0

    def forward(self, hidden_states):
        if self.num_pad_remove > 0:
            hidden_states = hidden_states[:, :, : -self.num_pad_remove]
        return hidden_states


class SpeechT5FeatureEncoder(nn.Module):
    """Construct the features from raw audio waveform"""

    def __init__(self, config):
        super().__init__()

        if config.feat_extract_norm == "group":
            conv_layers = [SpeechT5GroupNormConvLayer(config, layer_id=0)] + [
                SpeechT5NoLayerNormConvLayer(config, layer_id=i + 1)
                for i in range(config.num_feat_extract_layers - 1)
            ]
        elif config.feat_extract_norm == "layer":
            conv_layers = [
                SpeechT5LayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)
            ]
        else:
            raise ValueError(
                f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
            )
        self.conv_layers = nn.ModuleList(conv_layers)
        self.gradient_checkpointing = False
        self._requires_grad = True

    def _freeze_parameters(self):
        for param in self.parameters():
            param.requires_grad = False
        self._requires_grad = False

    def forward(self, input_values):
        hidden_states = input_values[:, None]

        # make sure hidden_states require grad for gradient_checkpointing
        if self._requires_grad and self.training:
            hidden_states.requires_grad = True

        for conv_layer in self.conv_layers:
            if self._requires_grad and self.gradient_checkpointing and self.training:
                hidden_states = self._gradient_checkpointing_func(
                    conv_layer.__call__,
                    hidden_states,
                )
            else:
                hidden_states = conv_layer(hidden_states)

        return hidden_states


class SpeechT5FeatureProjection(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
        self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
        self.dropout = nn.Dropout(config.feat_proj_dropout)

    def forward(self, hidden_states):
        # non-projected hidden states are needed for quantization
        norm_hidden_states = self.layer_norm(hidden_states)
        hidden_states = self.projection(norm_hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states, norm_hidden_states


class SpeechT5SpeechEncoderPrenet(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.feature_encoder = SpeechT5FeatureEncoder(config)
        self.feature_projection = SpeechT5FeatureProjection(config)

        # model only needs masking vector if mask prob is > 0.0
        if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
            self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_())

        self.pos_conv_embed = SpeechT5PositionalConvEmbedding(config)
        self.pos_sinusoidal_embed = SpeechT5SinusoidalPositionalEmbedding(
            config.max_speech_positions + config.pad_token_id + 1,
            config.hidden_size,
            config.pad_token_id,
        )

    def freeze_feature_encoder(self):
        self.feature_encoder._freeze_parameters()

    def forward(
        self,
        input_values: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        mask_time_indices: Optional[torch.FloatTensor] = None,
    ):
        extract_features = self.feature_encoder(input_values)
        extract_features = extract_features.transpose(1, 2)

        if attention_mask is not None:
            # compute reduced attention_mask corresponding to feature vectors
            attention_mask = self._get_feature_vector_attention_mask(
                extract_features.shape[1],
                attention_mask,
            )

        hidden_states, extract_features = self.feature_projection(extract_features)
        hidden_states = self._mask_hidden_states(
            hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
        )

        positional_conv_embedding = self.pos_conv_embed(hidden_states)
        hidden_states = hidden_states + positional_conv_embedding

        if attention_mask is not None:
            padding_mask = attention_mask.ne(1).long()
        else:
            padding_mask = torch.zeros(hidden_states.shape[:2], dtype=torch.long, device=hidden_states.device)

        positional_sinusoidal_embeddings = self.pos_sinusoidal_embed(padding_mask)
        hidden_states = hidden_states + positional_sinusoidal_embeddings

        return hidden_states, attention_mask

    def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
        # Effectively attention_mask.sum(-1), but not in-place to be able to run in inference mode
        non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
        output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths).to(torch.long)
        batch_size = attention_mask.shape[0]

        attention_mask = torch.zeros(
            (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
        )
        # these two operations make sure that all values before the output lengths indices are attended to
        attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
        attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
        return attention_mask

    def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
        """
        Computes the output length of the convolutional layers
        """

        def _conv_out_length(input_length, kernel_size, stride):
            # 1D convolutional layer output length formula taken
            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1

        for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
            input_lengths = _conv_out_length(input_lengths, kernel_size, stride)

        return input_lengths
    def _mask_hidden_states(
        self,
        hidden_states: torch.FloatTensor,
        mask_time_indices: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
    ):
        """
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://arxiv.org/abs/1904.08779).
        """

        # `config.apply_spec_augment` can set masking to False
        if not getattr(self.config, "apply_spec_augment", True):
            return hidden_states

        # generate indices & apply SpecAugment along time axis
        batch_size, sequence_length, hidden_size = hidden_states.size()

        if mask_time_indices is not None:
            # apply SpecAugment along time axis with given mask_time_indices
            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
        elif self.config.mask_time_prob > 0 and self.training:
            mask_time_indices = _compute_mask_indices(
                (batch_size, sequence_length),
                mask_prob=self.config.mask_time_prob,
                mask_length=self.config.mask_time_length,
                attention_mask=attention_mask,
                min_masks=self.config.mask_time_min_masks,
            )
            mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)

        if self.config.mask_feature_prob > 0 and self.training:
            # generate indices & apply SpecAugment along feature axis
            mask_feature_indices = _compute_mask_indices(
                (batch_size, hidden_size),
                mask_prob=self.config.mask_feature_prob,
                mask_length=self.config.mask_feature_length,
                min_masks=self.config.mask_feature_min_masks,
            )
            mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
            mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
            hidden_states[mask_feature_indices] = 0

        return hidden_states


class SpeechT5SpeechDecoderPrenet(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.layers = nn.ModuleList(
            [
                nn.Linear(
                    config.num_mel_bins if i == 0 else config.speech_decoder_prenet_units,
                    config.speech_decoder_prenet_units,
                )
                for i in range(config.speech_decoder_prenet_layers)
            ]
        )

        self.final_layer = nn.Linear(config.speech_decoder_prenet_units, config.hidden_size)
        self.encode_positions = SpeechT5ScaledPositionalEncoding(
            config.positional_dropout,
            config.hidden_size,
            config.max_speech_positions,
        )
        self.speaker_embeds_layer = nn.Linear(config.speaker_embedding_dim + config.hidden_size, config.hidden_size)

    def _consistent_dropout(self, inputs_embeds, p):
        mask = torch.bernoulli(inputs_embeds[0], p=p)
        all_masks = mask.unsqueeze(0).repeat(inputs_embeds.size(0), 1, 1)
        return torch.where(all_masks == 1, inputs_embeds, 0) * 1 / (1 - p)

    def forward(
        self,
        input_values: torch.Tensor,
        speaker_embeddings: Optional[torch.Tensor] = None,
    ):
        # Dropout is always applied, even when evaluating. See §2.2 in https://arxiv.org/abs/1712.05884.
        inputs_embeds = input_values
        for layer in self.layers:
            inputs_embeds = nn.functional.relu(layer(inputs_embeds))
            inputs_embeds = self._consistent_dropout(inputs_embeds, self.config.speech_decoder_prenet_dropout)

        inputs_embeds = self.final_layer(inputs_embeds)
        inputs_embeds = self.encode_positions(inputs_embeds)

        if speaker_embeddings is not None:
            speaker_embeddings = nn.functional.normalize(speaker_embeddings)
            speaker_embeddings = speaker_embeddings.unsqueeze(1).expand(-1, inputs_embeds.size(1), -1)
            inputs_embeds = torch.cat([inputs_embeds, speaker_embeddings], dim=-1)
            inputs_embeds = nn.functional.relu(self.speaker_embeds_layer(inputs_embeds))

        return inputs_embeds


class SpeechT5BatchNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()

        if layer_id == 0:
            in_conv_dim = config.num_mel_bins
        else:
            in_conv_dim = config.speech_decoder_postnet_units

        if layer_id == config.speech_decoder_postnet_layers - 1:
            out_conv_dim = config.num_mel_bins
        else:
            out_conv_dim = config.speech_decoder_postnet_units

        self.conv = nn.Conv1d(
            in_conv_dim,
            out_conv_dim,
            kernel_size=config.speech_decoder_postnet_kernel,
            stride=1,
            padding=(config.speech_decoder_postnet_kernel - 1) // 2,
            bias=False,
        )
        self.batch_norm = nn.BatchNorm1d(out_conv_dim)

        if layer_id < config.speech_decoder_postnet_layers - 1:
            self.activation = nn.Tanh()
        else:
            self.activation = None

        self.dropout = nn.Dropout(config.speech_decoder_postnet_dropout)

    def forward(self, hidden_states):
        hidden_states = self.conv(hidden_states)
        hidden_states = self.batch_norm(hidden_states)
        if self.activation is not None:
            hidden_states = self.activation(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class SpeechT5SpeechDecoderPostnet(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.feat_out = nn.Linear(config.hidden_size, config.num_mel_bins * config.reduction_factor)
        self.prob_out = nn.Linear(config.hidden_size, config.reduction_factor)

        self.layers = nn.ModuleList(
            [SpeechT5BatchNormConvLayer(config, i) for i in range(config.speech_decoder_postnet_layers)]
        )

    def forward(self, hidden_states: torch.Tensor):
        outputs_before_postnet = self.feat_out(hidden_states).view(hidden_states.size(0), -1, self.config.num_mel_bins)
        outputs_after_postnet = self.postnet(outputs_before_postnet)
        logits = self.prob_out(hidden_states).view(hidden_states.size(0), -1)
        return outputs_before_postnet, outputs_after_postnet, logits

    def postnet(self, hidden_states: torch.Tensor):
        layer_output = hidden_states.transpose(1, 2)
        for layer in self.layers:
            layer_output = layer(layer_output)
        return hidden_states + layer_output.transpose(1, 2)


class SpeechT5TextEncoderPrenet(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
        self.encode_positions = SpeechT5ScaledPositionalEncoding(
            config.positional_dropout,
            config.hidden_size,
            config.max_text_positions,
        )

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def forward(self, input_ids: torch.Tensor):
        inputs_embeds = self.embed_tokens(input_ids)
        inputs_embeds = self.encode_positions(inputs_embeds)
        return inputs_embeds


class SpeechT5TextDecoderPrenet(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.dropout = nn.Dropout(config.positional_dropout)
        self.embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)

        self.embed_positions = SpeechT5SinusoidalPositionalEmbedding(
            config.max_text_positions + config.pad_token_id + 1,
            config.hidden_size,
            config.pad_token_id,
        )

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
    ):
        if input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        else:
            raise ValueError("You have to specify `decoder_input_ids`")

        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
        positions = self.embed_positions(input_ids, past_key_values_length)

        inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
        inputs_embeds += positions
        inputs_embeds = self.dropout(inputs_embeds)

        return inputs_embeds, attention_mask


class SpeechT5TextDecoderPostnet(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

    def forward(self, hidden_states: torch.Tensor):
        return self.lm_head(hidden_states)

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings
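
# Shape sketch (illustrative only, not part of the original file): SpeechT5Attention
# below adds a relative position bias to the usual scaled dot-product scores. With
# queries reshaped to (batch * heads, tgt_len, head_dim) and `position_bias` of shape
# (tgt_len, src_len, head_dim) coming from SpeechT5RelativePositionalEncoding, the extra
# term is computed roughly as:
#
#   rel = torch.matmul(q.transpose(0, 1), position_bias.transpose(-2, -1))  # (tgt, B*H, src)
#   scores = scores + rel.transpose(0, 1)                                   # (B*H, tgt, src)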


class SpeechT5Attention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper with relative position bias (see
    https://aclanthology.org/N18-2074.pdf)
    """

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        if self.head_dim * num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        position_bias: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        # if key_value_states are provided this layer is used as a cross-attention layer
        # for the decoder
        is_cross_attention = key_value_states is not None

        bsz, tgt_len, _ = hidden_states.size()

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scaling
        # get key, value proj
        if is_cross_attention and past_key_value is not None:
            # reuse k, v, cross_attentions
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        elif is_cross_attention:
            # cross_attentions
            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
        elif past_key_value is not None:
            # reuse k, v, self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)
        else:
            # self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        if self.is_decoder:
            past_key_value = (key_states, value_states)

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        # relative attention bias
        if position_bias is not None:
            reshape_q = query_states.contiguous().view(bsz * self.num_heads, -1, self.head_dim).transpose(0, 1)
            rel_pos_bias = torch.matmul(reshape_q, position_bias.transpose(-2, -1))
            rel_pos_bias = rel_pos_bias.transpose(0, 1).view(
                bsz * self.num_heads, position_bias.size(0), position_bias.size(1)
            )
            attn_weights += rel_pos_bias

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if layer_head_mask is not None:
            if layer_head_mask.size() != (self.num_heads,):
                raise ValueError(
                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
                    f" {layer_head_mask.size()}"
                )
            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if output_attentions:
            # this operation is a bit awkward, but it's required to make sure that
            # attn_weights keeps its gradient: it has to be reshaped twice and reused
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped, past_key_value


class SpeechT5FeedForward(nn.Module):
    def __init__(self, config, intermediate_size):
        super().__init__()
        self.intermediate_dropout = nn.Dropout(config.activation_dropout)

        self.intermediate_dense = nn.Linear(config.hidden_size, intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

        self.output_dense = nn.Linear(intermediate_size, config.hidden_size)
        self.output_dropout = nn.Dropout(config.hidden_dropout)

    def forward(self, hidden_states):
        hidden_states = self.intermediate_dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        hidden_states = self.intermediate_dropout(hidden_states)

        hidden_states = self.output_dense(hidden_states)
        hidden_states = self.output_dropout(hidden_states)
        return hidden_states
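
# Flow sketch for the encoder layer below (illustrative only): SpeechT5 uses a
# post-LayerNorm residual pattern, i.e. normalization happens after each residual add:
#
#   attn_out, weights, _ = attention(x)      # shared relative position_bias per layer
#   x = layer_norm(x + dropout(attn_out))
#   x = final_layer_norm(x + feed_forward(x))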


class SpeechT5EncoderLayer(nn.Module):
    def __init__(self, config: SpeechT5Config):
        super().__init__()
        self.attention = SpeechT5Attention(
            embed_dim=config.hidden_size,
            num_heads=config.encoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=False,
        )
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.feed_forward = SpeechT5FeedForward(config, config.encoder_ffn_dim)
        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        position_bias: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ):
        """
        Args:
            hidden_states (`torch.FloatTensor`):
                input to the layer of shape `(batch, seq_len, hidden_size)`
            attention_mask (`torch.FloatTensor`):
                attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
                large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(config.encoder_attention_heads,)`.
            position_bias (`torch.FloatTensor`):
                relative position embeddings of size `(seq_len, seq_len, hidden_size // encoder_attention_heads)`
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states
        hidden_states, attn_weights, _ = self.attention(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            position_bias=position_bias,
            output_attentions=output_attentions,
        )

        hidden_states = self.dropout(hidden_states)
        hidden_states = residual + hidden_states

        hidden_states = self.layer_norm(hidden_states)
        hidden_states = hidden_states + self.feed_forward(hidden_states)
        hidden_states = self.final_layer_norm(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs
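
# Cache layout sketch (illustrative): each decoder layer's `past_key_value` is a 4-tuple.
# Entries 0-1 cache self-attention keys/values of shape
# (batch, num_heads, past_len, head_dim); entries 2-3 cache the cross-attention
# projections of the encoder output, which never change during generation:
#
#   self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
#   cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None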
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, hidden_size)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, hidden_size)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
                size `(decoder_attention_heads,)`.
            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        # Self Attention
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        # add present self-attn cache to positions 1,2 of present_key_value tuple
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            past_key_value=self_attn_past_key_value,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        hidden_states = self.dropout(hidden_states)
        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Cross-Attention Block
        cross_attn_present_key_value = None
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            residual = hidden_states

            # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
            hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=cross_attn_past_key_value,
                output_attentions=output_attentions,
            )
            hidden_states = self.dropout(hidden_states)
            hidden_states = residual + hidden_states
            hidden_states = self.encoder_attn_layer_norm(hidden_states)

            # add cross-attn to positions 3,4 of present_key_value tuple
            present_key_value = present_key_value + cross_attn_present_key_value

        # Fully Connected
        hidden_states = hidden_states + self.feed_forward(hidden_states)
        hidden_states = self.final_layer_norm(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)

        if use_cache:
            outputs += (present_key_value,)

        return outputs


class SpeechT5PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = SpeechT5Config
    base_model_prefix = "speecht5"
    main_input_name = "input_values"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, SpeechT5PositionalConvEmbedding):
            nn.init.normal_(
                module.conv.weight,
                mean=0,
                std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
            )
            nn.init.constant_(module.conv.bias, 0)
        elif isinstance(module, SpeechT5FeatureProjection):
            k = math.sqrt(1 / module.projection.in_features)
            nn.init.uniform_(module.projection.weight, a=-k, b=k)
            nn.init.uniform_(module.projection.bias, a=-k, b=k)
        elif isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Conv1d):
            nn.init.kaiming_normal_(module.weight)
            if module.bias is not None:
                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
                nn.init.uniform_(module.bias, a=-k, b=k)
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
    r}   c                    t                                                     t          j        j        j                  | _        t          j        j                  | _	        j
        | _        t          j        fdt          j                  D                       | _        t!          j        j        z  j                  | _        d| _        |                                  d S )Nr-  c                 .    g | ]}t                    S rC   )r  rD   rE   r}   s     r)   rF   z,SpeechT5Encoder.__init__.<locals>.<listcomp>  "    $h$h$ha%9&%A%A$h$h$hr+   F)ro   rp   r   r   r   r/  r   r   r  r   encoder_layerdrop	layerdropr  rQ   encoder_layersrn  r  r  encoder_max_relative_positionr  r  	post_initr3  s    `r)   rp   zSpeechT5Encoder.__init__  s       ,v'9v?TUUUz&"7881m$h$h$h$h5QWQfKgKg$h$h$hiiA&"@@&Bf 
  
 ',# 	r+   Nr   r.   	head_maskr  output_hidden_statesreturn_dictr7   c           	      (   ||n| j         j        }||n| j         j        }||n| j         j        }|t	          ||j                  }|                     |          }|                     |          }|                     |          }t                      pt          |           }|rdnd}	|rdnd}
|p|                                d         t          | j                  k    r@t          dt          | j                   d|                                d          d          t          | j                  D ]\  }}|r|	|fz   }	d}| j        rt#          j        g           }|| j        k     }|r|rZ| j        r1| j        r*|                     |j        |||||         nd||          }n |||||||         nd|          }|d         }|rd	}|r|
|d
         fz   }
|r|	|fz   }	|st/          d ||	|
fD                       S t1          ||	|
          S )a  
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
                Features extracted from the speech or text input by the encoder prenet.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
                `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NrC   r   z&The head_mask should be specified for  layers, but it is for .F)r.   r  r  r  rd  r   c              3      K   | ]}||V  	d S r   rC   rD   vs     r)   	<genexpr>z*SpeechT5Encoder.forward.<locals>.<genexpr>`  s(      mmq_`_l_l_l_l_lmmr+   last_hidden_stater   
attentions)r}   r  r?  use_return_dictr   rH   r   r   r  r   r   r   rV   rn  r&   	enumerater&  r   rL   r:  r  r'  r(  tupler   )r|   r   r.   r>  r  r?  r@  r  synced_gpusall_hidden_statesall_self_attentionsidxencoder_layerskip_the_layerdropout_probabilitylayer_outputss                   r)   r   zSpeechT5Encoder.forward  s   H 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B] %7H[\\N66]33,,];;022R6LT6R6R"6@BBD$5?bb4  ~~"c$+&6&666 /S=M=M / /!((+/ / /  
 #,DK"8"8 #	P #	PC# I$58H$H! #N} F&+jnn#!4t~!E! 1[ 1. 4= $($E$E%.%&+4+@3d%)% %MM %2M%'5&3;D;P3VZ*;% % %M !.a 0 - ,  P&9]1=M<O&O# 	E 1]4D D 	nmm]4EGZ$[mmmmmm++*
 
 
 	
r+   NNNNNr   r   r   r   r   rp   r   rf  r   r   rS   r   r   r   r   r   r   s   @r)   r4  r4    s         ~      ( 26,0,0/3&*p
 p
(p
 !.p
 EL)	p

 $D>p
 'tnp
 d^p
 
uo%	&p
 p
 p
 p
 p
 p
 p
 p
r+   r4  c                        e Zd ZdZdef fdZ	 	 	 	 	 ddej        deej	                 deej	                 dee
         d	ee
         d
ee
         deeef         fdZ xZS )SpeechT5EncoderWithSpeechPrenetz
class SpeechT5EncoderWithSpeechPrenet(SpeechT5PreTrainedModel):
    """
    Wrapper around SpeechT5Encoder that applies SpeechT5SpeechEncoderPrenet to convert the audio waveform data to
    hidden features.
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.prenet = SpeechT5SpeechEncoderPrenet(config)
        self.wrapped_encoder = SpeechT5Encoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_values: torch.FloatTensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        hidden_states, attention_mask = self.prenet(input_values, attention_mask)

        outputs = self.wrapped_encoder(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return outputs
class SpeechT5EncoderWithTextPrenet(SpeechT5PreTrainedModel):
    """
    Wrapper around SpeechT5Encoder that applies SpeechT5TextEncoderPrenet to convert the input_ids to hidden features.
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.prenet = SpeechT5TextEncoderPrenet(config)
        self.wrapped_encoder = SpeechT5Encoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.prenet.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.prenet.set_input_embeddings(value)

    def forward(
        self,
        input_values: torch.FloatTensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        hidden_states = self.prenet(input_values)

        outputs = self.wrapped_encoder(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return outputs
class SpeechT5EncoderWithoutPrenet(SpeechT5PreTrainedModel):
    """
    This wrapper class is a helper class to correctly load pretrained checkpoints when used in combination with
    [`SpeechT5Model`].
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.wrapped_encoder = SpeechT5Encoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_values: torch.FloatTensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        return self.wrapped_encoder(
            hidden_states=input_values,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
class SpeechT5Decoder(SpeechT5PreTrainedModel):
    """
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`SpeechT5DecoderLayer`]
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.layerdrop = config.decoder_layerdrop

        self.layers = nn.ModuleList([SpeechT5DecoderLayer(config) for _ in range(config.decoder_layers)])

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        hidden_states: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        """
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
                Features extracted from the speech or text input by the decoder prenet.
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
                selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
                cross-attention on hidden heads. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        input_shape = hidden_states.size()[:-1]

        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

        attention_mask = _prepare_4d_causal_attention_mask(
            attention_mask, input_shape, hidden_states, past_key_values_length
        )

        # expand encoder attention mask
        if encoder_hidden_states is not None and encoder_attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            encoder_attention_mask = _prepare_4d_attention_mask(
                encoder_attention_mask, hidden_states.dtype, tgt_len=input_shape[-1]
            )

        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
        next_decoder_cache = () if use_cache else None

        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
            if attn_mask is not None:
                if attn_mask.size()[0] != len(self.layers):
                    raise ValueError(
                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
                        f" {attn_mask.size()[0]}."
                    )

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            skip_the_layer = False
            if self.training:
                dropout_probability = torch.rand([])
                skip_the_layer = dropout_probability < self.layerdrop
            if skip_the_layer and not synced_gpus:
                continue

            past_key_value = past_key_values[idx] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    head_mask[idx] if head_mask is not None else None,
                    cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
                    None,
                    output_attentions,
                    use_cache,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_attention_mask,
                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                    cross_attn_layer_head_mask=(
                        cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
                    ),
                    past_key_value=past_key_value,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )
            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

                if encoder_hidden_states is not None:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attentions, all_cross_attentions]
                if v is not None
            )

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
class SpeechT5DecoderWithSpeechPrenet(SpeechT5PreTrainedModel):
    """
    Wrapper around SpeechT5Decoder that applies SpeechT5SpeechDecoderPrenet to convert log-mel filterbanks to hidden
    features.
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.prenet = SpeechT5SpeechDecoderPrenet(config)
        self.wrapped_decoder = SpeechT5Decoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        speaker_embeddings: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        decoder_hidden_states = self.prenet(input_values, speaker_embeddings)

        outputs = self.wrapped_decoder(
            hidden_states=decoder_hidden_states,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return outputs
class SpeechT5DecoderWithTextPrenet(SpeechT5PreTrainedModel):
    """
    Wrapper around SpeechT5Decoder that applies SpeechT5TextDecoderPrenet to convert input tokens to hidden features.
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.prenet = SpeechT5TextDecoderPrenet(config)
        self.wrapped_decoder = SpeechT5Decoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.prenet.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.prenet.set_input_embeddings(value)

    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        decoder_hidden_states, attention_mask = self.prenet(input_values, attention_mask, past_key_values)

        outputs = self.wrapped_decoder(
            hidden_states=decoder_hidden_states,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return outputs


class SpeechT5DecoderWithoutPrenet(SpeechT5PreTrainedModel):
    """
    This wrapper class is a helper class to correctly load pretrained checkpoints when used in combination with
    [`SpeechT5Model`].
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.wrapped_decoder = SpeechT5Decoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        outputs = self.wrapped_decoder(
            hidden_states=input_values,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        return outputs
class SpeechT5GuidedMultiheadAttentionLoss(nn.Module):
    """
    Guided attention loss from the paper [Efficiently Trainable Text-to-Speech System Based on Deep Convolutional
    Networks with Guided Attention](https://arxiv.org/abs/1710.08969), adapted for multi-head attention.
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__()
        self.sigma = config.guided_attention_loss_sigma
        self.scale = config.guided_attention_loss_scale

    def forward(
        self, attentions: torch.FloatTensor, input_masks: torch.BoolTensor, output_masks: torch.BoolTensor
    ) -> torch.Tensor:
        """
        Compute the attention loss.

        Args:
            attentions (`torch.FloatTensor` of shape `(batch_size, layers * heads, output_sequence_length, input_sequence_length)`):
                Batch of multi-head attention weights
            input_masks (`torch.BoolTensor` of shape `(batch_size, input_sequence_length)`):
                Input attention mask as booleans.
            output_masks (`torch.BoolTensor` of shape `(batch_size, output_sequence_length)`):
                Target attention mask as booleans.

        Returns:
            `torch.Tensor` with the loss value
        """
        guided_attn_masks = self._make_guided_attention_masks(input_masks, output_masks, attentions.device)
        masks = output_masks.unsqueeze(-1) & input_masks.unsqueeze(-2)
        masks = masks.to(attentions.device).unsqueeze(1)

        losses = guided_attn_masks * attentions
        loss = torch.mean(losses.masked_select(masks))
        return self.scale * loss

    def _make_guided_attention_masks(self, input_masks, output_masks, device):
        input_lengths = input_masks.sum(-1)
        output_lengths = output_masks.sum(-1)

        guided_attn_masks = torch.zeros(
            (len(input_masks), output_masks.shape[1], input_masks.shape[1]), device=device
        )

        for idx, (ilen, olen) in enumerate(zip(input_lengths, output_lengths)):
            guided_attn_masks[idx, :olen, :ilen] = self._make_guided_attention_mask(ilen, olen, self.sigma, device)

        return guided_attn_masks.unsqueeze(1)

    @staticmethod
    def _make_guided_attention_mask(input_length, output_length, sigma, device):
        grid_y, grid_x = torch.meshgrid(
            torch.arange(input_length, device=device),
            torch.arange(output_length, device=device),
            indexing="xy",
        )
        grid_x = grid_x.float() / output_length
        grid_y = grid_y.float() / input_length
        return 1.0 - torch.exp(-((grid_y - grid_x) ** 2) / (2 * (sigma**2)))
class SpeechT5SpectrogramLoss(nn.Module):
    """
    Loss computation used by SpeechT5ForTextToSpeech.
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__()
        self.use_guided_attention_loss = config.use_guided_attention_loss
        self.guided_attention_loss_num_heads = config.guided_attention_loss_num_heads
        self.reduction_factor = config.reduction_factor

        self.l1_criterion = L1Loss()
        self.bce_criterion = BCEWithLogitsLoss(pos_weight=torch.tensor(5.0))

        if self.use_guided_attention_loss:
            self.attn_criterion = SpeechT5GuidedMultiheadAttentionLoss(config)

    def forward(
        self,
        attention_mask: torch.LongTensor,
        outputs_before_postnet: torch.FloatTensor,
        outputs_after_postnet: torch.FloatTensor,
        logits: torch.FloatTensor,
        labels: torch.FloatTensor,
        cross_attentions: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        padding_mask = labels != -100.0

        # mask out the padded portions
        labels = labels.masked_select(padding_mask)
        outputs_before_postnet = outputs_before_postnet.masked_select(padding_mask)
        outputs_after_postnet = outputs_after_postnet.masked_select(padding_mask)

        # spectrogram loss
        l1_loss = self.l1_criterion(outputs_after_postnet, labels) + self.l1_criterion(outputs_before_postnet, labels)

        # construct stop labels from the padding mask
        masks = padding_mask[:, :, 0]
        stop_labels = torch.cat([~masks * 1.0, torch.ones(masks.size(0), 1).to(masks.device)], dim=1)
        stop_labels = stop_labels[:, 1:].masked_select(masks)
        logits = logits.masked_select(masks)

        # stop token loss
        bce_loss = self.bce_criterion(logits, stop_labels)

        # combined loss
        loss = l1_loss + bce_loss

        # guided attention loss
        if self.use_guided_attention_loss:
            attn = torch.cat([x[:, : self.guided_attention_loss_num_heads] for x in cross_attentions], dim=1)
            input_masks = attention_mask == 1
            output_masks = padding_mask[:, :, 0]
            if self.reduction_factor > 1:
                output_masks = output_masks[:, self.reduction_factor - 1 :: self.reduction_factor]
            attn_loss = self.attn_criterion(attn, input_masks, output_masks)
            loss += attn_loss

        return loss


SPEECHT5_BASE_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`SpeechT5Config`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
        encoder ([`SpeechT5EncoderWithSpeechPrenet`] or [`SpeechT5EncoderWithTextPrenet`] or `None`):
            The Transformer encoder module that applies the appropriate speech or text encoder prenet. If `None`,
            [`SpeechT5EncoderWithoutPrenet`] will be used and the `input_values` are assumed to be hidden states.
        decoder ([`SpeechT5DecoderWithSpeechPrenet`] or [`SpeechT5DecoderWithTextPrenet`] or `None`):
            The Transformer decoder module that applies the appropriate speech or text decoder prenet. If `None`,
            [`SpeechT5DecoderWithoutPrenet`] will be used and the `decoder_input_values` are assumed to be hidden
            states.
"""

SPEECHT5_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`SpeechT5Config`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

SPEECHT5_INPUTS_DOCSTRING = r"""
    Args:
        attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0,
            1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            <Tip warning={true}>

            `attention_mask` should only be passed if the corresponding processor has `config.return_attention_mask ==
            True`. For all models whose processor has `config.return_attention_mask == False`, `attention_mask` should
            **not** be passed to avoid degraded performance when doing batched inference. For such models
            `input_values` should simply be padded with 0 and passed without `attention_mask`. Be aware that these
            models also yield slightly different results depending on whether `input_values` is padded or not.

            </Tip>

        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

        head_mask (`torch.FloatTensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        decoder_head_mask (`torch.FloatTensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
            Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.

        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_values` (those
            that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_values` of shape `(batch_size, sequence_length)`. decoder_inputs_embeds (`torch.FloatTensor`
            of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*): Optionally, instead of passing
            `decoder_input_values` you can choose to directly pass an embedded representation. If `past_key_values` is
            used, optionally only the last `decoder_inputs_embeds` have to be input (see `past_key_values`). This is
            useful if you want more control over how to convert `decoder_input_values` indices into associated vectors
            than the model's internal embedding lookup matrix.

        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.

        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.

        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare SpeechT5 Encoder-Decoder Model outputting raw hidden-states without any specific pre- or post-nets.",
    SPEECHT5_BASE_START_DOCSTRING,
)
class SpeechT5Model(SpeechT5PreTrainedModel):
    def __init__(
        self,
        config: SpeechT5Config,
        encoder: Optional[nn.Module] = None,
        decoder: Optional[nn.Module] = None,
    ):
        super().__init__(config)
        self.config = config
        self.encoder = SpeechT5EncoderWithoutPrenet(config) if encoder is None else encoder
        self.decoder = SpeechT5DecoderWithoutPrenet(config) if decoder is None else decoder

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        if isinstance(self.encoder, SpeechT5EncoderWithTextPrenet):
            return self.encoder.get_input_embeddings()
        if isinstance(self.decoder, SpeechT5DecoderWithTextPrenet):
            return self.decoder.get_input_embeddings()
        return None

    def set_input_embeddings(self, value):
        if isinstance(self.encoder, SpeechT5EncoderWithTextPrenet):
            self.encoder.set_input_embeddings(value)
        if isinstance(self.decoder, SpeechT5DecoderWithTextPrenet):
            self.decoder.set_input_embeddings(value)

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        if isinstance(self.encoder, SpeechT5EncoderWithSpeechPrenet):
            self.encoder.prenet.freeze_feature_encoder()

    @add_start_docstrings_to_model_forward(SPEECHT5_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_values: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_values: Optional[torch.Tensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        speaker_embeddings: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]:
        r"""
        input_values (`torch.Tensor` of shape `(batch_size, sequence_length)`):
            Depending on which encoder is being used, the `input_values` are either: float values of the input raw
            speech waveform, or indices of input sequence tokens in the vocabulary, or hidden states.

        decoder_input_values (`torch.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Depending on which decoder is being used, the `decoder_input_values` are either: float values of log-mel
            filterbank features extracted from the raw speech waveform, or indices of decoder input sequence tokens in
            the vocabulary, or hidden states.

        speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
            Tensor containing the speaker embeddings.

        Returns:
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_values=input_values,
                attention_mask=attention_mask,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        # If the user passed a tuple for encoder_outputs, wrap it in a BaseModelOutput when return_dict=True
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        # downsample encoder attention mask (only for encoders with speech input)
        if attention_mask is not None and isinstance(self.encoder, SpeechT5EncoderWithSpeechPrenet):
            encoder_attention_mask = self.encoder.prenet._get_feature_vector_attention_mask(
                encoder_outputs[0].shape[1], attention_mask
            )
        else:
            encoder_attention_mask = attention_mask

        if isinstance(self.decoder, SpeechT5DecoderWithSpeechPrenet):
            decoder_args = {"speaker_embeddings": speaker_embeddings}
        else:
            decoder_args = {}

        decoder_outputs = self.decoder(
            input_values=decoder_input_values,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_outputs[0],
            encoder_attention_mask=encoder_attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            **decoder_args,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


@add_start_docstrings(
    "SpeechT5 Model with a speech encoder and a text decoder.",
    SPEECHT5_START_DOCSTRING,
)
class SpeechT5ForSpeechToText(SpeechT5PreTrainedModel):
    _tied_weights_keys = ["text_decoder_postnet.lm_head.weight"]

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)

        if config.vocab_size is None:
            raise ValueError(
                f"You are trying to instantiate {self.__class__} with a configuration that does not define the"
                " vocabulary size of the language model head. Please instantiate the model as follows:"
                " `SpeechT5ForSpeechToText.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of"
                " your model's configuration."
            )

        speech_encoder = SpeechT5EncoderWithSpeechPrenet(config)
        text_decoder = SpeechT5DecoderWithTextPrenet(config)
        self.speecht5 = SpeechT5Model(config, speech_encoder, text_decoder)

        self.text_decoder_postnet = SpeechT5TextDecoderPostnet(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        return self.speecht5.get_encoder()

    def get_decoder(self):
        return self.speecht5.get_decoder()

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        self.get_encoder().prenet.freeze_feature_encoder()

    def get_output_embeddings(self):
        return self.text_decoder_postnet.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.text_decoder_postnet.set_output_embeddings(new_embeddings)

    @add_start_docstrings_to_model_forward(SPEECHT5_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, Seq2SeqLMOutput]:
        r"""
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
            into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (*pip install
            soundfile*). To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.

        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`SpeechT5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            SpeechT5 uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the language modeling loss. Indices should either be in `[0, ..., config.vocab_size]`
            or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is
            only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

            Label indices can be obtained using [`SpeechT5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

        Returns:

        Example:

        ```python
        >>> from transformers import SpeechT5Processor, SpeechT5ForSpeechToText
        >>> from datasets import load_dataset

        >>> dataset = load_dataset(
        ...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True
        ... )  # doctest: +IGNORE_RESULT
        >>> dataset = dataset.sort("id")
        >>> sampling_rate = dataset.features["audio"].sampling_rate

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_asr")
        >>> model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr")

        >>> # audio file is decoded on the fly
        >>> inputs = processor(audio=dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
        >>> predicted_ids = model.generate(**inputs, max_length=100)

        >>> # transcribe speech
        >>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
        >>> transcription[0]
        'mister quilter is the apostle of the middle classes and we are glad to welcome his gospel'
        ```

        ```python
        >>> inputs["labels"] = processor(text_target=dataset[0]["text"], return_tensors="pt").input_ids

        >>> # compute loss
        >>> loss = model(**inputs).loss
        >>> round(loss.item(), 2)
        19.68
        ```
        NT)r,   r.   r  r  r>  r  ry  r  r  r  r  r?  r@  r   r"   r   )	r  r  r  r  r  r|  r  r  r  )r}   rK  r*   r   r    r  r  r	   r   r  r   r  r  r  r|  r  r  r  )r|   r,   r.   r  r  r>  r  ry  r  r  r  r  r?  r@  r  r  r  r  loss_fctoutputs                       r)   r   zSpeechT5ForSpeechToText.forward  s_   d &1%<kk$+B] ($6DK4dk6X% %! --%)!2#9/!5++/!5   
 
  **71:66'))H8FKKDK,BCCV[[QS__UUD 	FY,F)-)9TGf$$vE#3")"?&9$5&-&G")"?&9

 

 

 
	
r+   c	           	          |K|d         d         j         d         }
|j         d         |
k    r|
}n|j         d         dz
  }|d d |d f         }||||||||dS )Nr   r   r   )r  r  r  r.   r>  r  ry  r  )r$   )r|   r  r  r.   r>  r  ry  r  r  kwargspast_lengthremove_prefix_lengths               r)   prepare_inputs_for_generationz5SpeechT5ForSpeechToText.prepare_inputs_for_generationp	  s     &)!,Q/5a8K !&q)K77'2$$ (9'>q'AA'E$ 1!!!5I5J5J2J K  /.!2,"!2$8"	
 	
 		
r+   c                 T    d}| D ]!}|t          fd|D                       fz  }"|S )NrC   c              3   t   K   | ]2}|                     d                     |j                            V  3dS )r   N)r   r   r   )rD   
past_statebeam_idxs     r)   rG  z9SpeechT5ForSpeechToText._reorder_cache.<locals>.<genexpr>	  sC      nnU_j--aZ=N1O1OPPnnnnnnr+   )rM  )r  r  reordered_past
layer_pasts    `  r)   _reorder_cachez&SpeechT5ForSpeechToText._reorder_cache	  sQ    ) 	 	Jnnnncmnnnnn NN r+   r  )NNNNNNN)r   r   r   _tied_weights_keysr   rp   r  r  rD  r  r  r   r  r   r   r  r   r   rf  re  r   r   rS   r   r   r  r   r  r   r   s   @r)   r  r    s       
 @@~      (+ + ++ + +; ; ;A A AH H H +*+DEE?YYY 59598<=A159=7;EIEI$(,0/3&*-1}
 }
u01}
 !!12}
 $E$45	}

 !))9 :}
 E-.}
 $E$56}
 'u|4}
 "%e.?(@"AB}
 "%e.?(@"AB}
 D>}
 $D>}
 'tn}
 d^}
 )*}
  
uo%	&!}
 }
 }
 ZY FE}
D !$
 $
 $
 $
L   \    r+   r        ?r1         4@Fmodelr{  	thresholdminlenratiomaxlenratiovocoderoutput_cross_attentionsreturn_output_lengthsc
           
        "#$ |t          d          |&d|| j        j        k                                    z
  }
n|}
|                    d          }| j                            ||
d          }|j        }t          | j        j        t                    r6| j        j        j
                            |d         j        d         |
          }
t          |                    d          |z  | j        j        z            }t          |                    d          |z  | j        j        z            }|                    |d| j        j                  }g }g }d }d}i "	 |dz  }| j        j        
                    ||          }| j        j                            |d d dd f         d ||
|d|d          }|r.|                    t'          j        |j        d                     |j                            d          }|j        }| j                            |          }|                    || j        j        | j        j                  }|                    |           |d d dd d f                             |d| j        j                  }t'          j        ||fd          }t'          j        | j                            |                    }||k     r~||k     rGt'          j        |d          |k    }t'          j        |          d                                         }ntA          tC          |                    }"fd	|D             }tC          |          dk    rht'          j"        |          }|#                    dd          $                    dd
          }| j        %                    |          }|D ]}||         "|<   tC          "          |k    rn"fdtA          tC          "                    D             }|	s|dk    r|d         n*t&          j&        j'        j(        )                    |d          }| ||          }n|}|rlt'          j        |d
          }|dk    rL |j        |t          |                    d          |z            g|                                dd          R  }||f}n*g #tA          |          D ]0} #                    ||                              d                     1|0t&          j&        j'        j(        )                    |d          }|#f}nKg $t&          j&        j'        j(        )                    |d          } ||          $#$fd#D             }!$|!f}|rit'          j        |d
          } |j        |t          |                    d          |z            g|                                dd          R  }g ||R }|S )Na  `speaker_embeddings` must be specified. For example, you can use a speaker embeddings by following
                    the code snippet provided in this link:
                    https://huggingface.co/datasets/Matthijs/cmu-arctic-xvectors
                    r   r   T)r,   r.   r@  r"   )r   r.   r  r  r  r  r  r@  r   c                     g | ]}|v|	S rC   rC   rD   r  result_spectrograms     r)   rF   z$_generate_speech.<locals>.<listcomp>
  s$    SSS!q@R7R7RA7R7R7Rr+   r   c                      g | ]
}|         S rC   rC   r  s     r)   rF   z$_generate_speech.<locals>.<listcomp>

  s    RRRa&q)RRRr+   )batch_firstc                 z    g | ]7}t                              d           t                    z            |z  8S r   )r;   r   r<   )rD   r  spectrogram_lengths	waveformss     r)   rF   z$_generate_speech.<locals>.<listcomp>$
  sB    sssZ[INN1$5$5<O8P8P$P Q QTU Usssr+   )*r&   r}   r   r;   r   r  r  rI  r  rY  r[  rG  r$   r-   r#   rk  r  r  rZ   r   r   r|  squeezer  speech_decoder_postnetr  r   sigmoidr  rN   rw  rP   rQ   rV   stackr   flattenr  r   r   rnnpad_sequence)%r  r,   r{  r.   r	  r
  r  r  r  r  r  r   encoder_outr  maxlenminlenoutput_sequencespectrogramr|  r  rQ  r  decoder_outlast_decoder_outputspectrumnew_spectrogramprobmeet_thresholdsmeet_indexesspectrograms
meet_indexr  r  waveform_lengthsr  r  r  s%                                     @@@r)   _generate_speechr0  	  s    !
 
 	
 !"lel6O&O%T%T%V%V!V!/


A

C.((!- )  K !, = %.(*IJJ 
!&!7!>!a!aN #%;"
 "
 *//22[@5<C``aaF*//22[@5<C``aaF 099#q%,B[\\OKO
C4q !& 6 = =oOa b bn,<</2337";#9+5 = 	
 	
 # 	T##EIk.JPQ$R$R$RSSS);CCAFF%5 /889LMM==el&CU\E^__8$$$ #111b!!!8,11#q%,:STT)_o$FANNN}U9BBCVWWXX<< V||"')Db"9"9"9Y"F${?;;A>EEGG$SYY//SSSS|SSSL<  1$$${;77+55a;;CCAqII$;CCLQQ". N NJ5A*5M&z22%&&#--i4j SRRR5=O9P9P3Q3QRRRL   3),l1ooux~7I7V7VWcqu7V7v7vgk**GG!G" 	2$y)9qAAAQww#8#3#8-22155;<<$?O?T?T?V?VWYWZWZ?[$ $ $   01G !s 	@ 	@A&&|A';';A'>'>????? 8>-::<UY:ZZL#%89GGI 8>-::<UY:ZZL--Isssss_rsss "23G" 	3$y)9qAAA4/4S)..q11C788 ;K;P;P;R;RSUSVSV;W      32!122GNr+   z8SpeechT5 Model with a text encoder and a speech decoder.c            '       N    e Zd ZdZdef fdZd Zd Z ee	           e
ee          	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d%deej                 deej                 d	eej                 d
eej                 deej                 deej                 deej                 deeeej                                   deeeej                                   dee         dee         dee         dee         deej                 deej                 deej                 deeef         f"d                        Z ej                    	 	 	 	 	 	 	 	 d&dej        deej                 deej                 dededed eej                 d!ed"edeej        eej        ej        f         f         fd#            Z ej                    	 	 	 	 	 	 	 	 d&dej        deej                 deej                 dededed eej                 d!ed"edeej        eej        ej        f         f         fd$            Z xZS )'SpeechT5ForTextToSpeechr   r}   c                 @   t                                          |           |j        t          d| j         d          t          |          }t          |          }t          |||          | _        t          |          | _
        |                                  d S )Nr  a    with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `SpeechT5ForTextToSpeech.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.)ro   rp   r  r&   r   rc  r  r  r  r  r  r=  )r|   r}   text_encoderspeech_decoderr   s       r)   rp   z SpeechT5ForTextToSpeech.__init__6
  s       $/ / / /   5V<<8@@%flNKK&B6&J&J# 	r+   c                 4    | j                                         S r   r  rC  s    r)   r  z#SpeechT5ForTextToSpeech.get_encoderJ
  r  r+   c                 4    | j                                         S r   r  rC  s    r)   r  z#SpeechT5ForTextToSpeech.get_decoderM
  r  r+   r  Nr.   r  r  r>  r  ry  r  r  r  r  r?  r@  r{  r  r  r7   c                 
   ||n| j         j        }|.|t          || j         j        |          \  }}| j         j        rd}|                     |||||||||	|
|||d          }|                     |d                   \  }}}d}|)t          | j                   } |||||||j                  }|s|f|dd         z   }||f|z   n|S t          |||j
        |j        |j        |j        |j        |j        |j        	  	        S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
            [`~PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        decoder_input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`):
            Float values of input mel spectrogram.

            SpeechT5 uses an all-zero spectrum as the starting token for `decoder_input_values` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_values` have to be input (see
            `past_key_values`).
        speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
            Tensor containing the speaker embeddings.
        labels (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`, *optional*):
            Float values of target mel spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss
            computation. Spectrograms can be obtained using [`SpeechT5Processor`]. See [`SpeechT5Processor.__call__`]
            for details.

        Returns:

        Example:

        ```python
        >>> from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, set_seed
        >>> import torch

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        >>> model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        >>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

        >>> inputs = processor(text="Hello, my dog is cute", return_tensors="pt")
        >>> speaker_embeddings = torch.zeros((1, 512))  # or load xvectors from a file

        >>> set_seed(555)  # make deterministic

        >>> # generate speech
        >>> speech = model.generate(inputs["input_ids"], speaker_embeddings=speaker_embeddings, vocoder=vocoder)
        >>> speech.shape
        torch.Size([15872])
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
            if decoder_input_values is None:
                decoder_input_values, decoder_attention_mask = shift_spectrograms_right(
                    labels, self.config.reduction_factor, decoder_attention_mask
                )
            if self.config.use_guided_attention_loss:
                output_attentions = True

        outputs = self.speecht5(
            input_values=input_ids,
            attention_mask=attention_mask,
            decoder_input_values=decoder_input_values,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            use_cache=use_cache,
            speaker_embeddings=speaker_embeddings,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        outputs_before_postnet, outputs_after_postnet, logits = self.speech_decoder_postnet(outputs[0])

        loss = None
        if labels is not None:
            criterion = SpeechT5SpectrogramLoss(self.config)
            loss = criterion(
                attention_mask,
                outputs_before_postnet,
                outputs_after_postnet,
                logits,
                labels,
                outputs.cross_attentions,
            )

        if not return_dict:
            output = (outputs_after_postnet,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqSpectrogramOutput(
            loss=loss,
            spectrogram=outputs_after_postnet,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )

    @torch.no_grad()
    def generate(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.LongTensor] = None,
        speaker_embeddings: Optional[torch.FloatTensor] = None,
        threshold: float = 0.5,
        minlenratio: float = 0.0,
        maxlenratio: float = 20.0,
        vocoder: Optional[nn.Module] = None,
        output_cross_attentions: bool = False,
        return_output_lengths: bool = False,
        **kwargs,
    ) -> Union[torch.FloatTensor, Tuple[torch.FloatTensor, torch.FloatTensor]]:
        r"""
        Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a
        speech waveform using a vocoder.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.

                Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
                [`~PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Attention mask from the tokenizer, required for batched inference to signal to the model where to
                ignore padded tokens from the input_ids.
            speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
                Tensor containing the speaker embeddings.
            threshold (`float`, *optional*, defaults to 0.5):
                The generated sequence ends when the predicted stop token probability exceeds this value.
            minlenratio (`float`, *optional*, defaults to 0.0):
                Used to calculate the minimum required length for the output sequence.
            maxlenratio (`float`, *optional*, defaults to 20.0):
                Used to calculate the maximum allowed length for the output sequence.
            vocoder (`nn.Module`, *optional*):
                The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
                spectrogram.
            output_cross_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of the decoder's cross-attention layers.
            return_output_lengths (`bool`, *optional*, defaults to `False`):
                Whether or not to return the concrete spectrogram/waveform lengths.

        Returns:
            `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
            - when `return_output_lengths` is False
                - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
                - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(num_frames,)` -- The predicted speech waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
            - when `return_output_lengths` is True
                - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
                are padded to the maximum length.
                - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `List[Int]` -- A list of
                all the concrete lengths for each spectrogram.
                - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
                - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `List[Int]` -- A list of all
                the concrete lengths for each waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
        Nr   r   zUThe first dimension of speaker_embeddings must be either 1 or the same as batch_size.r   rv  r&   r0  )r|   r   r.   r{  r	  r
  r  r  r  r  r  r_   s               r)   generatez SpeechT5ForTextToSpeech.generate
  s    J )"**J!&&q))Z77%**1--22);)B)B:q)Q)Q&&$o    #!
 
 	
r+   c
                    |m|                     d          }
|                     d          |
k    r?|                     d          dk    r|                    |
d          }nt          d          t          | |||||||||	
  
        S )a  
        Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a
        speech waveform using a vocoder.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.

                Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
                [`~PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
                Tensor containing the speaker embeddings.
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
                `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            threshold (`float`, *optional*, defaults to 0.5):
                The generated sequence ends when the predicted stop token probability exceeds this value.
            minlenratio (`float`, *optional*, defaults to 0.0):
                Used to calculate the minimum required length for the output sequence.
            maxlenratio (`float`, *optional*, defaults to 20.0):
                Used to calculate the maximum allowed length for the output sequence.
            vocoder (`nn.Module`, *optional*, defaults to `None`):
                The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
                spectrogram.
            output_cross_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of the decoder's cross-attention layers.
            return_output_lengths (`bool`, *optional*, defaults to `False`):
                Whether or not to return the concrete spectrogram/waveform lengths.

        Returns:
            `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
            - when `return_output_lengths` is False
                - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
                - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(num_frames,)` -- The predicted speech waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
            - when `return_output_lengths` is True
                - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
                are padded to the maximum length.
                - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `List[Int]` -- A list of
                all the concrete lengths for each spectrogram.
                - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
                - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `List[Int]` -- A list of all
                the concrete lengths for each waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
        Nr   r   zUThe first dimension of speaker_embeddings must be either 1 or the same as batch size.r=  )r|   r   r{  r.   r	  r
  r  r  r  r  r_   s              r)   generate_speechz'SpeechT5ForTextToSpeech.generate_speech&  s    R )"**J!&&q))Z77%**1--22);)B)B:q)Q)Q&&$o    #!
 
 	
r+   NNNNNNNNNNNNNNNNNNr  r1   r  NFF)r   r   r   r1  r   rp   r  r  r   r  r   r   r  r   r   re  rf  r   r   rS   r   r   r   r   r   r  r>  r@  r   r   s   @r)   r2  r2  /
  s       
 "O~      (+ + ++ + + +*+DEE+CRabbb 1559<@=A159=7;EIEI$(,0/3&*:>.2.2#v
 v
E,-v
 !!12v
 'u'89	v

 !))9 :v
 E-.v
 $E$56v
 'u|4v
 "%e.?(@"ABv
 "%e.?(@"ABv
 D>v
 $D>v
 'tnv
 d^v
 %U%67v
  *+!v
" el+#v
$ 
u..	/%v
 v
 v
 cb FEv
p U]__ 6::> !'+(-&+Y
 Y
#Y
 !!12Y
 %U%67	Y

 Y
 Y
 Y
 ")$Y
 "&Y
  $Y
 
u %(95;L(L"MM	NY
 Y
 Y
 _Y
v U]__ ;?59 !'+(-&+]
 ]
#]
 %U%67]
 !!12	]

 ]
 ]
 ]
 ")$]
 "&]
  $]
 
u %(95;L(L"MM	N]
 ]
 ]
 _]
 ]
 ]
 ]
 ]
r+   r2  z:SpeechT5 Model with a speech encoder and a speech decoder.c            '       &    e Zd Zdef fdZd Zd Zd Z ee	           e
ee          	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d%deej                 d	eej                 d
eej                 deej                 deej                 deej                 deej                 deeeej                                   deeeej                                   dee         dee         dee         dee         deej                 deej                 deej                 deeef         f"d                        Z ej                    	 	 	 	 	 	 	 	 d&dej        deej                 d	eej                 deded ed!eej                 d"ed#edej        fd$            Z xZS )'SpeechT5ForSpeechToSpeechr}   c                    t                                          |           t          |          }t          |          }t	          |||          | _        t          |          | _        |                                  d S r   )	ro   rp   rY  r  r  r  r  r  r=  )r|   r}   r  r5  r   s       r)   rp   z"SpeechT5ForSpeechToSpeech.__init__  sp       8@@8@@%fnnMM&B6&J&J# 	r+   c                 4    | j                                         S r   r  rC  s    r)   r  z%SpeechT5ForSpeechToSpeech.get_encoder  r  r+   c                 4    | j                                         S r   r  rC  s    r)   r  z%SpeechT5ForSpeechToSpeech.get_decoder  r  r+   c                 \    |                                  j                                         dS r  r  rC  s    r)   rD  z0SpeechT5ForSpeechToSpeech.freeze_feature_encoder  r  r+   r  Nr,   r.   r  r  r>  r  ry  r  r  r  r  r?  r@  r{  r  r  r7   c                    ||n| j         j        }| |t          || j         j        |          \  }}|                     |||||||||	|
|||d          }|                     |d                   \  }}}d}|s|f|dd         z   }||f|z   n|S t          |||j        |j        |j	        |j
        |j        |j        |j        	  	        S )a
  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
            into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (*pip install
            soundfile*). To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
        decoder_input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`):
            Float values of input mel spectrogram.

            SpeechT5 uses an all-zero spectrum as the starting token for `decoder_input_values` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_values` have to be input (see
            `past_key_values`).
        speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
            Tensor containing the speaker embeddings.
        labels (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`, *optional*):
            Float values of target mel spectrogram. Spectrograms can be obtained using [`SpeechT5Processor`]. See
            [`SpeechT5Processor.__call__`] for details.

        Returns:

        Example:

        ```python
        >>> from transformers import SpeechT5Processor, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan, set_seed
        >>> from datasets import load_dataset
        >>> import torch

        >>> dataset = load_dataset(
        ...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True
        ... )  # doctest: +IGNORE_RESULT
        >>> dataset = dataset.sort("id")
        >>> sampling_rate = dataset.features["audio"].sampling_rate

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_vc")
        >>> model = SpeechT5ForSpeechToSpeech.from_pretrained("microsoft/speecht5_vc")
        >>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

        >>> # audio file is decoded on the fly
        >>> inputs = processor(audio=dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

        >>> speaker_embeddings = torch.zeros((1, 512))  # or load xvectors from a file

        >>> set_seed(555)  # make deterministic

        >>> # generate speech
        >>> speech = model.generate_speech(inputs["input_values"], speaker_embeddings, vocoder=vocoder)
        >>> speech.shape
        torch.Size([77824])
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
            if decoder_input_values is None:
                decoder_input_values, decoder_attention_mask = shift_spectrograms_right(
                    labels, self.config.reduction_factor, decoder_attention_mask
                )

        outputs = self.speecht5(
            input_values=input_values,
            attention_mask=attention_mask,
            decoder_input_values=decoder_input_values,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            use_cache=use_cache,
            speaker_embeddings=speaker_embeddings,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        _, spectrogram, logits = self.speech_decoder_postnet(outputs[0])

        loss = None

        if not return_dict:
            output = (spectrogram,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqSpectrogramOutput(
            loss=loss,
            spectrogram=spectrogram,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )

    @torch.no_grad()
    def generate_speech(
        self,
        input_values: torch.FloatTensor,
        speaker_embeddings: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        threshold: float = 0.5,
        minlenratio: float = 0.0,
        maxlenratio: float = 20.0,
        vocoder: Optional[nn.Module] = None,
        output_cross_attentions: bool = False,
        return_output_lengths: bool = False,
    ) -> torch.FloatTensor:
        r"""
        Converts a raw speech waveform into a sequence of mel spectrograms, which are subsequently turned back into a
        speech waveform using a vocoder.

        Args:
            input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
                Float values of input raw speech waveform.

                Values can be obtained by loading a *.flac* or *.wav* audio file into an array of type `List[float]` or
                a `numpy.ndarray`, *e.g.* via the soundfile library (*pip install soundfile*). To prepare the array
                into `input_values`, the [`SpeechT5Processor`] should be used for padding and conversion into a tensor
                of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
            speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
                Tensor containing the speaker embeddings.
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
                `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            threshold (`float`, *optional*, defaults to 0.5):
                The generated sequence ends when the predicted stop token probability exceeds this value.
            minlenratio (`float`, *optional*, defaults to 0.0):
                Used to calculate the minimum required length for the output sequence.
            maxlenratio (`float`, *optional*, defaults to 20.0):
                Used to calculate the maximum allowed length for the output sequence.
            vocoder (`nn.Module`, *optional*, defaults to `None`):
                The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
                spectrogram.
            output_cross_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of the decoder's cross-attention layers.
            return_output_lengths (`bool`, *optional*, defaults to `False`):
                Whether or not to return the concrete spectrogram/waveform lengths.

        Returns:
            `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
            - when `return_output_lengths` is False
                - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
                - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(num_frames,)` -- The predicted speech waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
            - when `return_output_lengths` is True
                - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
                are padded to the maximum length.
                - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `List[Int]` -- A list of
                all the concrete lengths for each spectrogram.
                - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
                - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `List[Int]` -- A list of all
                the concrete lengths for each waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
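
        Example (an illustrative sketch of voice conversion, assuming `inputs`, `speaker_embeddings` and `vocoder`
        are prepared as in the example for [`~SpeechT5ForSpeechToSpeech.forward`]):

        ```python
        speech = model.generate_speech(inputs["input_values"], speaker_embeddings, vocoder=vocoder)
        ```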
        N)r   i   rO  )r   rR   r   r0  )
r|   r,   r{  r.   r	  r
  r  r  r  r  s
             r)   r@  z)SpeechT5ForSpeechToSpeech.generate_speech  sS    R %!&Xl>Q!R!R!R#!
 
 	
r+   rA  rB  )r   r   r   r   rp   r  r  rD  r   r  r   r   r  r   r   rf  re  r   r   rS   r   r   r   r   r   r  r@  r   r   s   @r)   rD  rD    s       

~ 
 
 
 
 
 
+ + ++ + +; ; ; +*+DEE+CRabbb 5959<@=A159=7;EIEI$(,0/3&*:>.2.2#p
 p
u01p
 !!12p
 'u'89	p

 !))9 :p
 E-.p
 $E$56p
 'u|4p
 "%e.?(@"ABp
 "%e.?(@"ABp
 D>p
 $D>p
 'tnp
 d^p
 %U%67p
  *+!p
" el+#p
$ 
u..	/%p
 p
 p
 cb FEp
d U]__ ;?59 !'+(-&+V
 V
'V
 %U%67V
 !!12	V

 V
 V
 V
 ")$V
 "&V
  $V
 
	V
 V
 V
 _V
 V
 V
 V
 V
r+   rD  aT  
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
    and behavior.

    Parameters:
        config ([`SpeechT5HifiGanConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


class HifiGanResidualBlock(nn.Module):
    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1):
        super().__init__()
        self.leaky_relu_slope = leaky_relu_slope

        self.convs1 = nn.ModuleList(
            [
                nn.Conv1d(
                    channels,
                    channels,
                    kernel_size,
                    stride=1,
                    dilation=dilation[i],
                    padding=self.get_padding(kernel_size, dilation[i]),
                )
                for i in range(len(dilation))
            ]
        )
        self.convs2 = nn.ModuleList(
            [
                nn.Conv1d(
                    channels,
                    channels,
                    kernel_size,
                    stride=1,
                    dilation=1,
                    padding=self.get_padding(kernel_size, 1),
                )
                for _ in range(len(dilation))
            ]
        )

    def get_padding(self, kernel_size, dilation=1):
        return (kernel_size * dilation - dilation) // 2

    def apply_weight_norm(self):
        weight_norm = nn.utils.weight_norm
        if hasattr(nn.utils.parametrizations, "weight_norm"):
            weight_norm = nn.utils.parametrizations.weight_norm

        for layer in self.convs1:
            weight_norm(layer)
        for layer in self.convs2:
            weight_norm(layer)

    def remove_weight_norm(self):
        for layer in self.convs1:
            nn.utils.remove_weight_norm(layer)
        for layer in self.convs2:
            nn.utils.remove_weight_norm(layer)

    def forward(self, hidden_states):
        for conv1, conv2 in zip(self.convs1, self.convs2):
            residual = hidden_states
            hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
            hidden_states = conv1(hidden_states)
            hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
            hidden_states = conv2(hidden_states)
            hidden_states = hidden_states + residual
        return hidden_states
>8 8 8 8  / / /      r+   rL  zHiFi-GAN vocoder.c                   b     e Zd ZeZdZdef fdZd Zd Zd Z	de
j        de
j        fdZ xZS )	SpeechT5HifiGanr%  r}   c                 |   t                                          |           t          |j                  | _        t          |j                  | _        t          j        |j	        |j
        ddd          | _        t          j                    | _        t          t          |j        |j                            D ]X\  }\  }}| j                            t          j        |j
        d|z  z  |j
        d|dz   z  z  ||||z
  dz                       Yt          j                    | _        t)          t          | j                            D ]a}|j
        d|dz   z  z  }t          |j        |j                  D ]4\  }}| j                            t-          ||||j                             5bt          j        |dddd          | _        |                     dt5          j        |j	                             |                     dt5          j        |j	                             |                                  d S )N   r   r   )rl   rm   r   r   r  r  )ro   rp   rV   resblock_kernel_sizesnum_kernelsupsample_ratesnum_upsamplesr   rt   model_in_dimupsample_initial_channelconv_prer  	upsamplerrL  rZ  upsample_kernel_sizesrZ   ConvTranspose1d	resblocksrQ   resblock_dilation_sizesrL  rX  	conv_postr   r   rR   rX   r=  )r|   r}   r  upsample_raterl   rV  rS  r   s          r)   rp   zSpeechT5HifiGan.__init__  s5      v;<< !677	+
 
 
 /8V=RTZTp9q9q/r/r 		 		+A+{N!!"31=3a!eE +((=8Q>      s4>**++ 	v 	vA61Q<HH),V-I6Ki)j)j v v%X%%&:8[RZ\b\s&t&tuuuuv 8QAaQRSSSVU[1D%E%EFFFWej1D&E&EFFF 	r+   c                     t          |t          j        t          j        f          rR|j        j                            d| j        j                   |j	        "|j	        j        
                                 dS dS dS )zInitialize the weights.r1   r  N)r  r   r0  rt   r   r'  r#  r}   r(  rn   r)  )r|   r,  s     r)   r.  zSpeechT5HifiGan._init_weights  su    fry")455 	)M&&CT[5R&SSS{& &&(((((	) 	)&&r+   c                 8   t           j        j        }t          t           j        j        d          rt           j        j        j        } || j                   | j        D ]} ||           | j        D ]}|                                  || j	                   d S r]  )
r   r   r   r   r   rq  rr  ru  r_  rw  r^  s      r)   r_  z!SpeechT5HifiGan.apply_weight_norm  s    h*28,m<< 	@(3?KDM"""^ 	 	EK^ 	& 	&E##%%%%DN#####r+   c                 $   t           j                            | j                   | j        D ]!}t           j                            |           "| j        D ]}|                                 t           j                            | j                   d S r   )r   r   ra  rq  rr  ru  rw  rb  s     r)   ra  z"SpeechT5HifiGan.remove_weight_norm  s    
##DM222^ 	/ 	/EH''....^ 	' 	'E$$&&&&
##DN33333r+   r7   c                    | j         j        r|| j        z
  | j        z  }|                                dk    }|s|                    d          }|                    dd          }|                     |          }t          | j	                  D ]}t          j                            || j         j                  } | j        |         |          } | j        || j        z           |          }t          d| j                  D ]&}| | j        || j        z  |z            |          z  }'|| j        z  }t          j                            |          }|                     |          }t%          j        |          }|s=|                    d                              dd                              d          }n|                    d          }|S )a  
        Converts a log-mel spectrogram into a speech waveform. Passing a batch of log-mel spectrograms returns a batch
        of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a single, un-batched speech
        waveform.

        Args:
            spectrogram (`torch.FloatTensor`):
                Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
                config.model_in_dim)`, or un-batched and of shape `(sequence_length, config.model_in_dim)`.

        Returns:
            `torch.FloatTensor`: Tensor containing the speech waveform. If the input spectrogram is batched, will be of
            shape `(batch_size, num_frames,)`. If un-batched, will be of shape `(num_frames,)`.
        r   r   r   r   r"   )r}   normalize_beforer  r  r   r   r   rq  rQ   rn  r   r}  rd  rX  rr  ru  rl  rw  r   tanhr  r   )r|   r%  
is_batchedr   r  	res_statejwaveforms           r)   r   zSpeechT5HifiGan.forward  s    ;' 	A&2dj@K __&&!+
 	3%//22K#--a33m44t)** 	9 	9AM44]DKD`aaM-DN1-m<<M<q4+;';<]KKI1d.// U UET^A0@,@1,DEmTTT		%(88MM00??}55
=11 	0$,,Q//99!Q??DDRHHHH %,,Q//Hr+   )r   r   r   r   r/  r1  rp   r.  r_  ra  r   rf  r   r   r   s   @r)   rh  rh    s        
 )L#O$4 $ $ $ $ $ $L) ) )
$ 
$ 
$4 4 4-5#4 -9J - - - - - - - -r+   rh  )r   Nr  rB  )dr   r   typingr   r   r   r   numpyrJ   r   torch.utils.checkpointr   torch.nnr   r	   r
   activationsr   integrations.deepspeedr   integrations.fsdpr   modeling_attn_mask_utilsr   r   modeling_outputsr   r   r   r   r   modeling_utilsr   r   r   r   r   r   configuration_speecht5r   r   
get_loggerr   r}  _HIDDEN_STATES_START_POSITIONr  r   r;   r*   r3   r   re  ndarrayrg   r  ri   r   r   r   r   r   r  r   r  r+  r7  rh  r  r  r  r  r  r  r  r  r  r  r4  rY  rc  rn  rt  r  r  r  r  r  SPEECHT5_BASE_START_DOCSTRINGSPEECHT5_START_DOCSTRINGr  r  r  rf  rS   r0  r2  rD  HIFIGAN_START_DOCSTRINGrL  rh  rC   r+   r)   <module>r     s1      / / / / / / / / / / / /                @ @ @ @ @ @ @ @ @ @ ! ! ! ! ! ! @ @ @ @ @ @ 7 7 7 7 7 7 e e e e e e e e              . - - - - - t t t t t t t t t t t t I I I I I I I I 
	H	%	% !"  #%, c [^    " ei0 0,0250KSTYT`Ka0 0 0 04 26t tc?tt t U-.	t
 t Zt t t tp    29   ,       8       2C8 C8 C8 C8 C8BI C8 C8 C8N* * * * *bi * * *Z    ry   0" " " " " " " "(    29   + + + + +RY + + +^1 1 1 1 1	 1 1 1D D D D D") D D DN1 1 1 1 1") 1 1 1h% % % % % % % %P< < < < <29 < < <2    	   .(- (- (- (- (-	 (- (- (-V& & & & & & & & ]B ]B ]B ]B ]B	 ]B ]B ]B@    ")   0: : : : :29 : : :zi i i i i29 i i iX'? '? '? '? '?o '? '? '?TF
 F
 F
 F
 F
- F
 F
 F
R" " " " "&= " " "J' ' ' ' '$; ' ' 'T
 
 
 
 
#: 
 
 
@L
 L
 L
 L
 L
- L
 L
 L
^- - - - -&= - - -`1 1 1 1 1$; 1 1 1h( ( ( ( (#: ( ( (V8M 8M 8M 8M 8M29 8M 8M 8Mv: : : : :bi : : :z! 0 "S l r! M
 M
 M
 M
 M
+ M
 M
	 M
` B X X X X X5 X X	 X| 7;15#'$)"'L L"L#L !!23L U-.	L
 L L L bi L "L  L 5eE$5u7H$HIIJL L L L^ B Q
 Q
 Q
 Q
 Q
5 Q
 Q
	 Q
h
 D e
 e
 e
 e
 e
 7 e
 e
	 e
P "; ; ; ; ;29 ; ; ;|  r r r r ro r r	 r r rr+   