
    g4                       d Z ddlZddlZddlmZmZmZ ddlZddl	Z	ddl
mc mZ ddlZ	ddl	mZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z'  e%j(        e)          Z*dZ+dZ,dZ-g dZ.dZ/dZ0dZ1ddgZ2dZ3dZ4	 	 dXdee5e5f         de6de5dee	j7                 de5dej8        fdZ9 G d d ej:                  Z; G d! d"ej:                  Z< G d# d$ej:                  Z= G d% d&ej:                  Z> G d' d(ej:                  Z? G d) d*ej:                  Z@ G d+ d,e@          ZA G d- d.ej:                  ZB G d/ d0ej:                  ZC G d1 d2ej:                  ZD G d3 d4ej:                  ZE G d5 d6ej:                  ZF G d7 d8ej:                  ZG G d9 d:ej:                  ZH G d; d<ej:                  ZI G d= d>ej:                  ZJ G d? d@ej:                  ZK G dA dBe          ZLdCZMdDZN e"dEeM           G dF dGeL                      ZO e"dHeM           G dI dJeL                      ZP e"dKeM           G dL dMeL                      ZQ e"dNeM           G dO dPeL                      ZR G dQ dRej:                  ZS G dS dTej:                  ZT e"dUeM           G dV dWeL                      ZUdS )YzPyTorch WavLM model.    N)OptionalTupleUnion)nn)CrossEntropyLoss   )ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)BaseModelOutputCausalLMOutputSequenceClassifierOutputTokenClassifierOutputWav2Vec2BaseModelOutputXVectorOutput)PreTrainedModel)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardis_peft_availablelogging   )WavLMConfig   r   z1patrickvonplaten/wavlm-libri-clean-100h-base-plus)r   i$  i   zZ'mister quilter is the aposle of the middle classes and we are glad to welcome his gospel'gQ)@zmicrosoft/wavlm-base-plus-sdzmicrosoft/wavlm-base-plus-svg
ףp=
?shape	mask_probmask_lengthattention_mask	min_masksreturnc                 @   | \  }dk     rt          d          k    rt          d d d          t          j                            d                                          fd}|9|                    d                                                                          nfd	t          |          D             }t          j	        |ft          
          }g }	 |          }
|
dk    r|S |D ]} ||          }t          j                            t          j        |dz
  z
            |d          }t          |          dk    rdz
  }n|d         }t          j        |t          j        |
|z
  t          j        
          |z  g          }|	                    |           t          j        |	          }	t          j        |	dddddf         ||
f          }	|	                    ||
z            }	t          j                  ddddf         }t          j        |||
f                              ||
z            }|	|z   }	|	                                dz
  k    rdz
  |	|	dz
  k    <   t          j        ||	dd           |S )af  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                     t          | z  z  z             }t          |          }|z  k    rz  }| dz
  z
  |k     rt          | dz
  z
  d          }|S )z;Given input length, compute how many spans should be maskedr   r   )intmax)input_lengthnum_masked_spanepsilonr   r   r   sequence_lengths     d/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/wavlm/modeling_wavlm.pycompute_num_masked_spanz6_compute_mask_indices.<locals>.compute_num_masked_spanr   s~    i,6DwNOOoy99 [(?::-<O ;?+o==!,+/"BAFFO    Nc                     g | ]}S  r/   ).0_r)   s     r*   
<listcomp>z)_compute_mask_indices.<locals>.<listcomp>   s    999!o999r,   dtyper   F)replace)
ValueErrornprandomranditemsumdetachtolistrangezerosboolchoicearangelenconcatenateonesint32appendarraybroadcast_toreshaper%   put_along_axis)r   r   r   r   r   
batch_sizer+   input_lengthsspec_aug_maskspec_aug_mask_idxsmax_num_masked_spanr&   r'   spec_aug_mask_idxdummy_mask_idxoffsetsr(   r)   s    `` `           @@r*   _compute_mask_indicesrT   L   sP   0 #(JQABBB_$$:^i : :'6: : :
 
 	
 innQ$$&&G        $ % 	2%%''..0009999uZ'8'8999  Hj/:$GGGM11/BBa% 5 511,?? I,,IlkAo677RW - 
 
  !!Q&& -q0NN.q1NN(;o(MUWU] ^ ^ ^ao op
 
 	!!"34444"455 111aaa:&5H+(V  ,33J@SVa@abb i$$T4]3Gog
4G'UVV^^'+5 G ,g5 /A"555GVYZGZ-!0CCD m%7B???r,   c                   &     e Zd Zd fd	Zd Z xZS )WavLMNoLayerNormConvLayerr   c                 Z   t                                                       |dk    r|j        |dz
           nd| _        |j        |         | _        t          j        | j        | j        |j        |         |j        |         |j	                  | _
        t          |j                 | _        d S )Nr   r   kernel_sizestridebias)super__init__conv_dimin_conv_dimout_conv_dimr   Conv1dconv_kernelconv_stride	conv_biasconvr	   feat_extract_activation
activationselfconfiglayer_id	__class__s      r*   r]   z"WavLMNoLayerNormConvLayer.__init__   s    <DqLL6?8a<88a"OH5I*84%h/!
 
 
	 !!?@r,   c                 Z    |                      |          }|                     |          }|S N)re   rg   ri   hidden_statess     r*   forwardz!WavLMNoLayerNormConvLayer.forward   s*    		-0066r,   r   __name__
__module____qualname__r]   rq   __classcell__rl   s   @r*   rV   rV      sR        A A A A A A      r,   rV   c                   &     e Zd Zd fd	Zd Z xZS )WavLMLayerNormConvLayerr   c                    t                                                       |dk    r|j        |dz
           nd| _        |j        |         | _        t          j        | j        | j        |j        |         |j        |         |j	                  | _
        t          j        | j        d          | _        t          |j                 | _        d S )Nr   r   rX   T)elementwise_affine)r\   r]   r^   r_   r`   r   ra   rb   rc   rd   re   	LayerNorm
layer_normr	   rf   rg   rh   s      r*   r]   z WavLMLayerNormConvLayer.__init__   s    <DqLL6?8a<88a"OH5I*84%h/!
 
 
	 ,t'8TRRR !?@r,   c                     |                      |          }|                    dd          }|                     |          }|                    dd          }|                     |          }|S )Nr-   )re   	transposer~   rg   ro   s     r*   rq   zWavLMLayerNormConvLayer.forward   se    		-00%//B7766%//B7766r,   rr   rs   rx   s   @r*   rz   rz      sR        A A A A A A      r,   rz   c                   &     e Zd Zd fd	Zd Z xZS )WavLMGroupNormConvLayerr   c                    t                                                       |dk    r|j        |dz
           nd| _        |j        |         | _        t          j        | j        | j        |j        |         |j        |         |j	                  | _
        t          |j                 | _        t          j        | j        | j        d          | _        d S )Nr   r   rX   T)
num_groupsnum_channelsaffine)r\   r]   r^   r_   r`   r   ra   rb   rc   rd   re   r	   rf   rg   	GroupNormr~   rh   s      r*   r]   z WavLMGroupNormConvLayer.__init__   s    <DqLL6?8a<88a"OH5I*84%h/!
 
 
	 !!?@,$2CRVRclpqqqr,   c                     |                      |          }|                     |          }|                     |          }|S rn   )re   r~   rg   ro   s     r*   rq   zWavLMGroupNormConvLayer.forward  s;    		-006666r,   rr   rs   rx   s   @r*   r   r      sR        r r r r r r       r,   r   c                   $     e Zd Z fdZd Z xZS )WavLMPositionalConvEmbeddingc                    t                                                       t          j        |j        |j        |j        |j        dz  |j                  | _        t          j        j	        }t          t          j        j        d          rt          j        j        j	        }t                      rdd l}|j                            | j        j        d          5   || j        dd          | _        d d d            n# 1 swxY w Y   t          | j        d          r-| j        j        j        j        }| j        j        j        j        }n| j        j        }| j        j        }|j                            | |           |j                            | |           n || j        dd          | _        t-          |j                  | _        t0          |j                 | _        d S )	Nr   )rY   paddinggroupsweight_normr   )modifier_rankweight)namedimparametrizations)r\   r]   r   ra   hidden_sizenum_conv_pos_embeddingsnum_conv_pos_embedding_groupsre   utilsr   hasattrr   r
   	deepspeedzeroGatheredParametersr   	original0	original1weight_gweight_vregister_external_parameterWavLMSamePadLayerr   r	   rf   rg   )ri   rj   r   r   r   r   rl   s         r*   r]   z%WavLMPositionalConvEmbedding.__init__  s   I62a77
 
 
	 h*28,m<< 	@(3?K%'' 	E22493CST2UU I I'K	aHHH	I I I I I I I I I I I I I I Ity"455 .95<F95<F9-9-N66tXFFFN66tXFFFF#DIH!DDDDI()GHH !?@s   C??DDc                     |                     dd          }|                     |          }|                     |          }|                     |          }|                     dd          }|S Nr   r   )r   re   r   rg   ro   s     r*   rq   z$WavLMPositionalConvEmbedding.forward1  se    %//155		-00]3366%//155r,   rs   rx   s   @r*   r   r     sM        A A A A AB      r,   r   c                   $     e Zd Z fdZd Z xZS )r   c                 l    t                                                       |dz  dk    rdnd| _        d S Nr   r   r   )r\   r]   num_pad_remove)ri   r   rl   s     r*   r]   zWavLMSamePadLayer.__init__>  s:    #:Q#>!#C#Caar,   c                 J    | j         dk    r|d d d d d | j          f         }|S Nr   )r   ro   s     r*   rq   zWavLMSamePadLayer.forwardB  s;    "")!!!QQQ0F43F2F0F*FGMr,   rs   rx   s   @r*   r   r   =  sL        K K K K K      r,   r   c                   .     e Zd ZdZ fdZd Zd Z xZS )WavLMFeatureEncoderz.Construct the features from raw audio waveformc                    t                                                       j        dk    r7t          d          gfdt	          j        dz
            D             z   }nDj        dk    r!fdt	          j                  D             }nt          dj         d	          t          j        |          | _	        d
| _
        d| _        d S )Ngroupr   rk   c                 8    g | ]}t          |d z             S )r   r   )rV   r0   irj   s     r*   r2   z0WavLMFeatureEncoder.__init__.<locals>.<listcomp>P  s>     K K KFG)&1q5AAAK K Kr,   r   layerc                 2    g | ]}t          |           S )r   )rz   r   s     r*   r2   z0WavLMFeatureEncoder.__init__.<locals>.<listcomp>T  s'    vvv126AFFFvvvr,   z`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)r\   r]   feat_extract_normr   r>   num_feat_extract_layersr6   r   
ModuleListconv_layersgradient_checkpointing_requires_grad)ri   rj   r   rl   s    ` r*   r]   zWavLMFeatureEncoder.__init__L  s   #w..26AFFFG K K K KKPQWQorsQsKtKtK K K KK %00vvvvPUV\VtPuPuvvvKKt1Ittt   =55&+#"r,   c                 P    |                                  D ]	}d|_        
d| _        d S )NF)
parametersrequires_gradr   ri   params     r*   _freeze_parametersz&WavLMFeatureEncoder._freeze_parameters]  s4    __&& 	( 	(E"'E#r,   c                     |d d d f         }| j         r| j        rd|_        | j        D ]>}| j         r*| j        r#| j        r|                     |j        |          }3 ||          }?|S )NT)r   trainingr   r   r   _gradient_checkpointing_func__call__)ri   input_valuesrp   
conv_layers       r*   rq   zWavLMFeatureEncoder.forwardb  s    $QQQW-  	/4= 	/*.M'* 	: 	:J" :t'B :t} : $ A A'!! !
 !+
= 9 9r,   )rt   ru   rv   __doc__r]   r   rq   rw   rx   s   @r*   r   r   I  s\        88# # # # #"$ $ $
      r,   r   c                        e Zd Z fdZ xZS )WavLMFeatureExtractorc                     t                                          |           t          j        d| j        j         d| j        j        d         j         dt                     d S )NzThe class `zD` has been depreciated and will be removed in Transformers v5. Use `r   z
` instead.)r\   r]   warningswarnrl   rt   	__bases__FutureWarningri   rj   rl   s     r*   r]   zWavLMFeatureExtractor.__init__v  sy       E$.1 E EN,Q/8E E E 		
 	
 	
 	
 	
r,   )rt   ru   rv   r]   rw   rx   s   @r*   r   r   u  s8        
 
 
 
 
 
 
 
 
r,   r   c                   $     e Zd Z fdZd Z xZS )WavLMFeatureProjectionc                 .   t                                                       t          j        |j        d         |j                  | _        t          j        |j        d         |j                  | _	        t          j
        |j                  | _        d S )Nr-   eps)r\   r]   r   r}   r^   layer_norm_epsr~   Linearr   
projectionDropoutfeat_proj_dropoutdropoutr   s     r*   r]   zWavLMFeatureProjection.__init__  sn    ,vr':@UVVV)FOB$79KLLz&":;;r,   c                     |                      |          }|                     |          }|                     |          }||fS rn   )r~   r   r   )ri   rp   norm_hidden_statess      r*   rq   zWavLMFeatureProjection.forward  sC    !__];;(:;;]33000r,   rs   rx   s   @r*   r   r     sG        < < < < <1 1 1 1 1 1 1r,   r   c                       e Zd ZdZ	 	 	 	 ddededed	ed
edef fdZ	 	 	 	 ddej	        de
ej	                 de
ej	                 dedeej	        e
ej	                 e
eej	                          f         f
dZdej        deej        ej        f         dej        dedej        ej        ff
dZdededej        fdZdej        dej        fdZ xZS )WavLMAttentionz=Multi-headed attention from 'Attention Is All You Need' paper        @     T	embed_dim	num_headsr   num_bucketsmax_distancehas_relative_position_biasc                    t                                                       || _        || _        || _        ||z  | _        | j        |z  | j        k    rt          d| j         d| d          | j        dz  | _        t          j	        ||          | _
        t          j	        ||          | _        t          j	        ||          | _        t          j	        ||          | _        || _        || _        t          j        t#          j        d| j        dd                    | _        t          j	        | j        d          | _        |r&t          j        | j        | j                  | _        d S d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      r      )r\   r]   r   r   r   head_dimr6   scalingr   r   k_projv_projq_projout_projr   r   	ParametertorchrE   gru_rel_pos_constgru_rel_pos_linear	Embeddingrel_attn_embed)ri   r   r   r   r   r   r   rl   s          r*   r]   zWavLMAttention.__init__  sa    	""!Y.MI%$.883dn 3 3%.3 3 3   }d*i	955i	955i	955	)Y77&(!#ejDNAq.Q.Q!R!R"$)DM1"="=% 	Q"$,t/?"P"PD	Q 	Qr,   NFr   rp   r   position_biasoutput_attentionsr    c                 F   |                                 \  }}}|^|                     ||          }|                    d                              |ddd                              || j        z  ||          }|                    |j        dd         | j        dfz             }	|	                    dddd          }	|                     |	          }
|
                    |	j        dd         dz             	                    d          }
t          j        |
                              dd          \  }}||| j        z  d	z
  z  d
z   }|                    || j        z  dd          |z  }|                    d||f          }|                     ||||          \  }}|||fS )z'Attention layer with relative attentionNr   r   r-   r   r   )r      r         ?g       @)sizecompute_bias	unsqueezerepeatviewr   r   permuter   r;   r   sigmoidchunkr   torch_multi_head_self_attention)ri   rp   r   r   r   indexbsztgt_lenr1   gated_hidden_statesrelative_position_projgate_agate_bgate_outputgated_position_biasattn_outputattn_weightss                    r*   rq   zWavLMAttention.forward  s    (,,..Wa   --gw??M''**11#q!Q??DDS4>EY[bdkll  ,001DSbS1IT^]_L`1`aa199!Q1EE "&!8!89L!M!M!7!<!<=P=VWZXZWZ=[^d=d!e!e!i!ijl!m!m '=>>DDQBDOO)? ?# EFL *..sT^/CRKKm[166GW7MNN$($H$H>+>@Q%
 %
!\ L-77r,   r  c                    |                     dd          x}x}}||                    d          nd}dx}	}
d}t          j        |||| j        | j        t          j        dg          t          j        | j	        j
        | j        j
        | j        j
        f          |	|
|| j        | j        j        | j        j
        | j        |||d| j	        j        | j        j        | j        j                  \  }}|                     dd          }|E|dddf                             |j        dd         | j        fz   |j        dd         z             }||fS )zCsimple wrapper around torch's multi_head_attention_forward functionr   r   NFT)use_separate_proj_weightq_proj_weightk_proj_weightv_proj_weight)r   neFmulti_head_attention_forwardr   r   r   emptycatr   r[   r   r   r   r   r   r   rI   r   )ri   rp   r   r  r   querykeyvaluekey_padding_maskbias_kbias_vadd_zero_attnr  r  s                 r*   r  z.WavLMAttention.torch_multi_head_self_attention  sx    ,55a;;;;e3A3M>,,Q///SW  %&$BNNKIt{')94;;KLMMLM MM%)+,+,+,+%
 %
 %
!\2 "++Aq11# (40=="2A2&$.)::\=OPQPRPR=SS L L((r,   query_length
key_lengthc                    t          j        |t           j                  d d d f         }t          j        |t           j                  d d d f         }||z
  }|                     |          }|                    | j        j        j                  }|                     |          }|                    g d          }|S )Nr3   )r   r   r   )	r   rB   long_relative_positions_buckettor   r   devicer   )ri   r   r!  context_positionmemory_positionrelative_positionrelative_position_bucketvaluess           r*   r   zWavLMAttention.compute_bias  s     <EJGGG4P,zDDDT111WM+.>>#'#B#BCT#U#U #;#>#>t?R?Y?`#a#a $$%=>>			**r,   relative_positionsc                    | j         dz  }|dk                        t          j                  |z  }t          j        |          }|dz  }||k     }t          j        |                                |z            }|t          j        | j        |z            z  }|||z
  z  }||z                       t          j                  }t          j	        |t          j
        ||dz
                      }|t          j        |||          z  }|S r   )r   r%  r   r#  abslogfloatmathr   min	full_likewhere)ri   r,  r   relative_buckets	max_exactis_smallrelative_positions_if_largerelative_position_if_larges           r*   r$  z)WavLMAttention._relative_positions_bucket!  s   &!+.266uzBB[P"Y'9::1$	%	1&+i0B0H0H0J0JY0V&W&W#&ADHTM^ajMjDkDk&k#&A[S\E\&]#&/2M&M%Q%QRWR\%]%]"%*Y&8RT_bcTc(d(d&
 &
" 	EK2DF`aaar,   )r   r   r   TNNFr   )rt   ru   rv   r   r$   r0  r@   r]   r   Tensorr   r   rq   FloatTensorr   
LongTensor
BoolTensorr  r   r$  rw   rx   s   @r*   r   r     s       GG +/"Q "Q"Q "Q 	"Q
 "Q "Q %)"Q "Q "Q "Q "Q "QN 2604"''8 '8|'8 !.'8  -	'8
  '8 
u|Xel3XeEL>Q5RR	S'8 '8 '8 '8R5)(5) e.0@@A5) #.	5)
  5) 
U.	/5) 5) 5) 5)n # %BS     U=N  SXSd                r,   r   c                   $     e Zd Z fdZd Z xZS )WavLMFeedForwardc                    t                                                       t          j        |j                  | _        t          j        |j        |j                  | _	        t          |j        t                    rt          |j                 | _        n|j        | _        t          j        |j        |j                  | _        t          j        |j                  | _        d S rn   )r\   r]   r   r   activation_dropoutintermediate_dropoutr   r   intermediate_sizeintermediate_dense
isinstance
hidden_actstrr	   intermediate_act_fnoutput_densehidden_dropoutoutput_dropoutr   s     r*   r]   zWavLMFeedForward.__init__8  s    $&Jv/H$I$I!"$)F,>@X"Y"Yf'-- 	9'-f.?'@D$$'-'8D$If&>@RSS j)>??r,   c                     |                      |          }|                     |          }|                     |          }|                     |          }|                     |          }|S rn   )rE  rI  rC  rJ  rL  ro   s     r*   rq   zWavLMFeedForward.forwardE  sg    //>>00??11-@@))-88++M::r,   rs   rx   s   @r*   r@  r@  7  sL        @ @ @ @ @      r,   r@  c                   2     e Zd Zd	dedef fdZd
dZ xZS )WavLMEncoderLayerTrj   r   c                    t                                                       t          |j        |j        |j        |j        |j        |          | _        t          j
        |j                  | _        t          j        |j        |j                  | _        t!          |          | _        t          j        |j        |j                  | _        d S N)r   r   r   r   r   r   r   r\   r]   r   r   num_attention_headsattention_dropoutr   max_bucket_distance	attentionr   r   rK  r   r}   r   r~   r@  feed_forwardfinal_layer_normri   rj   r   rl   s      r*   r]   zWavLMEncoderLayer.__init__P      '(0,*3'A
 
 
 z&"788,v'9v?TUUU,V44 "V-?VEZ [ [ [r,   NFr   c                    |}|                      |||||          \  }}}|                     |          }||z   }|                     |          }||                     |          z   }|                     |          }||f}|r||fz  }|S )Nr   r   r   r  )rV  r   r~   rW  rX  )	ri   rp   r   r   r   r  attn_residualr  outputss	            r*   rq   zWavLMEncoderLayer.forward_  s    %59^^)'/ 6D 6
 6
2|] ]33%566%(9(9-(H(HH--m<< -0 	'&Gr,   Tr:  rt   ru   rv   r   r@   r]   rq   rw   rx   s   @r*   rO  rO  O  sm        \ \{ \ \ \ \ \ \ \       r,   rO  c                   2     e Zd Zddedef fdZd	dZ xZS )
 WavLMEncoderLayerStableLayerNormTrj   r   c                    t                                                       t          |j        |j        |j        |j        |j        |          | _        t          j
        |j                  | _        t          j        |j        |j                  | _        t!          |          | _        t          j        |j        |j                  | _        d S rQ  rR  rY  s      r*   r]   z)WavLMEncoderLayerStableLayerNorm.__init__y  rZ  r,   NFc                    |}|                      |          }|                     ||||          \  }}}|                     |          }||z   }||                     |                     |                    z   }||f}|r||fz  }|S )N)r   r   r   )r~   rV  r   rW  rX  )ri   rp   r   r   r   r]  r  r^  s           r*   rq   z(WavLMEncoderLayerStableLayerNorm.forward  s    %6659^^)'/	 6D 6
 6
2|] ]33%5%(9(9$:O:OP]:^:^(_(__ -0 	'&Gr,   r_  )NNFr`  rx   s   @r*   rb  rb  x  sm        \ \{ \ \ \ \ \ \ \       r,   rb  c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )WavLMEncoderc                    t                                                       | _        t                    | _        t          j        j        j                  | _	        t          j
        j                  | _        t          j        fdt          j                  D                       | _        d| _        d S )Nr   c                 :    g | ]}t          |d k              S r   )r   )rO  r   s     r*   r2   z)WavLMEncoder.__init__.<locals>.<listcomp>  s,    uuuPQv16KKKuuur,   Fr\   r]   rj   r   pos_conv_embedr   r}   r   r   r~   r   rK  r   r   r>   num_hidden_layerslayersr   r   s    `r*   r]   zWavLMEncoder.__init__  s    :6BB,v'9v?TUUUz&"788muuuuUZ[a[sUtUtuuu
 
 ',###r,   NFTc                    |rdnd }|rdnd }|d|| <   |                      |          }||z   }|                     |          }|                     |          }t                      pt	          |           }	d }
t          | j                  D ]\  }}|r||fz   }t          j        g           }| j	        o|dk    o|| j
        j        k     }|r|	rJ| j        r&| j	        r|                     |j        |||
|          }n ||||
||          }|d d         \  }}
|rd}|r||d         fz   }|r||fz   }|st          d |||fD                       S t!          |||          S )	Nr/   r   r   r\  r   NNNc              3      K   | ]}||V  	d S rn   r/   r0   vs     r*   	<genexpr>z'WavLMEncoder.forward.<locals>.<genexpr>  (      mmq_`_l_l_l_l_lmmr,   last_hidden_staterp   
attentions)rk  r~   r   r
   r   	enumeraterm  r   r9   r   rj   	layerdropr   r   r   tupler   ri   rp   r   r   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsposition_embeddingssynced_gpusr   r   r   dropout_probabilityskip_the_layerlayer_outputss                   r*   rq   zWavLMEncoder.forward  s    #7@BBD$5?bb4%-0M>/*"11-@@%(;;66]33022R6LT6R6R!$+.. !	P !	PHAu# I$58H$H! #(*R..!]fq1uf:MPTP[Pe:eN! A[ A. 4= $($E$E%&%)% %MM %*E%'5&3*;% % %M 0=RaR/@,} 3 2  P&9]1=M<O&O# 	E 1]4D D 	nmm]4EGZ$[mmmmmm++*
 
 
 	
r,   NFFTrs   rx   s   @r*   rf  rf    sb        	, 	, 	, 	, 	, "C
 C
 C
 C
 C
 C
 C
 C
r,   rf  c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )WavLMEncoderStableLayerNormc                    t                                                       | _        t                    | _        t          j        j        j                  | _	        t          j
        j                  | _        t          j        fdt          j                  D                       | _        d| _        d S )Nr   c                 :    g | ]}t          |d k              S ri  )rb  r   s     r*   r2   z8WavLMEncoderStableLayerNorm.__init__.<locals>.<listcomp>  s=        1UVZ[U[]]]  r,   Frj  r   s    `r*   r]   z$WavLMEncoderStableLayerNorm.__init__  s    :6BB,v'9v?TUUUz&"788m   v788  
 
 ',###r,   NFTc                    |rdnd }|rdnd }|d|| <   |                      |          }||z   }|                     |          }t                      pt          |           }	d }
t	          | j                  D ]\  }}|r||fz   }t          j        g           }| j        o|dk    o|| j	        j
        k     }|r|	rI| j        r&| j        r|                     |j        |||
|          }n |||||
          }|d d         \  }}
|rd}|r||d         fz   }|                     |          }|r||fz   }|st          d |||fD                       S t!          |||          S )Nr/   r   )r   r   r   r   ro  c              3      K   | ]}||V  	d S rn   r/   rq  s     r*   rs  z6WavLMEncoderStableLayerNorm.forward.<locals>.<genexpr><  rt  r,   ru  )rk  r   r
   r   rx  rm  r   r9   r   rj   ry  r   r   r   r~   rz  r   r{  s                   r*   rq   z#WavLMEncoderStableLayerNorm.forward  s    #7@BBD$5?bb4%-.M>/*"11-@@%(;;]33022R6LT6R6R!$+..  	P  	PHAu# I$58H$H! #(*R..!]fq1uf:MPTP[Pe:eN! A[ A . 4= $($E$E%&%)% %MM %*E%'5*;&3	% % %M 0=RaR/@,} 3 2  P&9]1=M<O&O#66 	E 1]4D D 	nmm]4EGZ$[mmmmmm+;LYl
 
 
 	
r,   r  rs   rx   s   @r*   r  r    sb        , , , , ," "A
 A
 A
 A
 A
 A
 A
 A
r,   r  c                   >     e Zd ZdZ fdZed             Zd Z xZS )WavLMGumbelVectorQuantizerz
    Vector quantization using gumbel softmax. See [CATEGORICAL REPARAMETERIZATION WITH
    GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf) for more information.
    c                    t                                                       |j        | _        |j        | _        |j        | j        z  dk    r t          d|j         d| j         d          t          j	        t          j        d| j        | j        z  |j        | j        z                      | _        t          j        |j        d         | j        | j        z            | _        d| _        d S )Nr   z`config.codevector_dim z5 must be divisible by `config.num_codevector_groups` z for concatenation.r   r-   r   )r\   r]   num_codevector_groupsr   num_codevectors_per_groupnum_varscodevector_dimr6   r   r   r   r<  codevectorsr   r^   weight_projtemperaturer   s     r*   r]   z#WavLMGumbelVectorQuantizer.__init__H  s     68 4?2a77%&*? % %6:o% % %   <a4=!@&BW[_[jBjkk
 
 9V_R%8$/DM:YZZ r,   c           	          |                      d          }t          j        t          j        |t          j        |dz             z  d                                                     }|S )Nr   r   gHz>r-   )meanr   expr;   r/  )probsmarginal_probs
perplexitys      r*   _compute_perplexityz.WavLMGumbelVectorQuantizer._compute_perplexity]  s^    **Y	.59^VZEZ;[;[*[ac d d ddeeiikk
r,   c                    |j         \  }}}|                     |          }|                    ||z  | j        z  d          }| j        rt
          j                            |                                | j	        d          }|
                    |          }t          j        |                    ||z  | j        d                                          d          }|                     |          }n|                    d          } |j        |j                              d|                    dd          d          }|                    ||z  | j        d          }|                     |          }|                    ||z  d          }|                    d          | j        z  }	|	                    ||z  | j        | j        d          }
|
                    d                              ||d          }
|
|fS )Nr-   T)tauhardr   r   r   r   )r   r  r   r   r   r   
functionalgumbel_softmaxr0  r  type_asr   softmaxr  argmax	new_zerosscatter_r   r  r  r;   )ri   rp   rL   r)   r   codevector_probscodevector_soft_distr  codevector_idxcodevectors_per_groupr  s              r*   rq   z"WavLMGumbelVectorQuantizer.forwardc  s   3@3F0
O[ ((77%**:+G$/+Y[]^^= 	D!};;M<O<O<Q<QW[Wgnr;ss/77FF $)="":#?RTUU[[]]ce$ $ $  112FGGJJ +11b199N6}68KLUUN''A..     044Z/5QSWSbdfgg112BCCJ+00o1MrRR 0 : :2 > >AQ Q+00o1Mt`d`moqrr!oob))..z?BOOJ&&r,   )	rt   ru   rv   r   r]   staticmethodr  rq   rw   rx   s   @r*   r  r  B  sl         
    *   \
"' "' "' "' "' "' "'r,   r  c                   $     e Zd Z fdZd Z xZS )WavLMAdapterc                    t                                                       j        j        k    rCt	          j        j        j                  | _        t	          j        j                  | _        nd x| _        | _        t	          j	        fdt          j                  D                       | _        j        | _        d S )Nc              3   6   K   | ]}t                    V  d S rn   )WavLMAdapterLayer)r0   r1   rj   s     r*   rs  z(WavLMAdapter.__init__.<locals>.<genexpr>  s,      #h#h!$5f$=$=#h#h#h#h#h#hr,   )r\   r]   output_hidden_sizer   r   r   projr}   proj_layer_normr   r>   num_adapter_layersrm  ry  r   s    `r*   r]   zWavLMAdapter.__init__  s     $(:::	&"4f6OPPDI#%<0I#J#JD  /33DI,m#h#h#h#huVMfGgGg#h#h#hhh)r,   c                 X   | j         1| j        *|                      |          }|                     |          }|                    dd          }| j        D ]=}t          j                                        }| j        r|| j        k    r ||          }>|                    dd          }|S r   )r  r  r   rm  r7   r8   r   ry  )ri   rp   r   layerdrop_probs       r*   rq   zWavLMAdapter.forward  s    9 T%9%E IIm44M 00??M%//155[ 	5 	5EY--//N= 5^dn%D%D %m 4 4%//155r,   rs   rx   s   @r*   r  r    sG        * * * * *      r,   r  c                   $     e Zd Z fdZd Z xZS )r  c                     t                                                       t          j        |j        d|j        z  |j        |j        d          | _        d S )Nr   r   )rZ   r   )r\   r]   r   ra   r  adapter_kernel_sizeadapter_stridere   r   s     r*   r]   zWavLMAdapterLayer.__init__  sU    I%))&(
 
 
			r,   c                 r    |                      |          }t          j                            |d          }|S )Nr   r   )re   r   r  gluro   s     r*   rq   zWavLMAdapterLayer.forward  s3    		-00))-Q)??r,   rs   rx   s   @r*   r  r    sG        
 
 
 
 
      r,   r  c                       e Zd ZdZeZdZdZdZd Z		 dde
ej        ef         dee         fd	Z	 dd
edej        fdZdS )WavLMPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    wavlmr   Tc           
      \   t          |t                    ro|j        j        j                            dd           |j        j        j                                         t          j	        
                    |j                   dS t          |t                    rt          j	                            |j        j        ddt          j        d|j        j        d         |j        j        z  z            z             t          j	                            |j        j        d           dS t          |t&                    r}t          j        d|j        j        z            }t          j	        
                    |j        j        | |           t          j	        
                    |j        j        | |           dS t          |t          j                  rT|j        j                            d| j        j                   |j         |j        j                                         dS dS t          |t          j        t          j        f          r?|j        j                                         |j        j                            d           dS t          |t          j                  rt          j	                            |j                   |j        [t          j        |j        |j        |j        d         z  z            }t          j	        
                    |j        | |           dS dS dS )	zInitialize the weightsr   r   )r  stdr   r   )abNr   )rF  r  r  r   datanormal_r[   zero_r   inituniform_r  r   re   r1  sqrtrY   in_channels	constant_r   r   in_featuresr   rj   initializer_ranger}   r   fill_ra   kaiming_normal_r   )ri   moduleks      r*   _init_weightsz"WavLMPreTrainedModel._init_weights  s    f899 	9%*222CCC#(..000GV/00000 <== 	9GOO"	!v{'>q'AFKD['["\]]]    
 Gfk.22222 677 	9	!f/;;<<AGV.5!qAAAGV.3rQ?????	** 	9M&&CT[5R&SSS{& &&((((( '&r| <== 	9K""$$$M$$S)))))	** 	9G##FM222{&Ifmv/AFDVWXDY/YZ[[  a 88888	9 	9 '&r,   NrM   add_adapterc                    || j         j        n|}d }t          | j         j        | j         j                  D ]\  }} ||||          }|r3t          | j         j                  D ]} ||d| j         j                  }|S )zH
        Computes the output length of the convolutional layers
        Nc                 <    t          j        | |z
  |d          dz   S )Nfloor)rounding_moder   )r   divr&   rY   rZ   s      r*   _conv_out_lengthzOWavLMPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length  s&     9\K7wWWWZ[[[r,   r   )rj   r  ziprb   rc   r>   r  r  )ri   rM   r  r  rY   rZ   r1   s          r*    _get_feat_extract_output_lengthsz5WavLMPreTrainedModel._get_feat_extract_output_lengths  s     2=1Ddk--+	\ 	\ 	\
 $'t{'>@W#X#X 	Q 	QK,,]KPPMM 	_4;9:: _ _ 0 04;C] ^ ^r,   feature_vector_lengthr   c                    |                     d          d d df         }|                     ||          }|                    t          j                  }|j        d         }t          j        ||f|j        |j                  }d|t          j	        |j        d         |j                  |dz
  f<   |
                    dg                               d          
                    dg                                          }|S )Nr-   r   r  r   )r4   r&  r   )r&  )cumsumr  r%  r   r#  r   r?   r4   r&  rB   flipr@   )ri   r  r   r  non_padded_lengthsoutput_lengthsrL   s          r*   "_get_feature_vector_attention_maskz7WavLMPreTrainedModel._get_feature_vector_attention_mask  s   
 ,22r2::111b5A>>?Q_j>kk'**5:66#)!,
./~7KTbTi
 
 
 uv^%9!%<^EZ[[[]kno]opq',,bT2299"==BBB4HHMMOOr,   rn   )rt   ru   rv   r   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointingr  r   r   r=  r$   r   r@   r  r  r/   r,   r*   r  r    s         
 L$O&*#9 9 9D Z^ "5#3S#89HPQU   0 Y] %(:?:J     r,   r  a  
    WavLM was proposed in [WavLM: Unified Speech Representation Learning with Labeled and Unlabeled
    Data](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo
    Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian,
    Jian Wu, Michael Zeng, Xiangzhan Yu, Furu Wei.

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving etc.).

    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`WavLMConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
aI  
    Args:
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
            soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
            conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details.
        attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0,
            1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            <Tip warning={true}>

            `attention_mask` should only be passed if the corresponding processor has `config.return_attention_mask ==
            True`. For all models whose processor has `config.return_attention_mask == False`, `attention_mask` should
            **not** be passed to avoid degraded performance when doing batched inference. For such models
            `input_values` should simply be padded with 0 and passed without `attention_mask`. Be aware that these
            models also yield slightly different results depending on whether `input_values` is padded or not.

            </Tip>

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
z_The bare WavLM Model transformer outputting raw hidden-states without any specific head on top.c                   t    e Zd Zdef fdZd Zd Z	 	 ddej        de	ej                 de	ej
                 fd	Z ee           eeeed
e          	 	 	 	 	 dde	ej                 de	ej                 de	ej                 de	e         de	e         de	e         deeef         fd                        Z xZS )
WavLMModelrj   c                    t                                          |           || _        t          |          | _        t          |          | _        |j        dk    s|j        dk    rBt          j
        t          j        |j                                                            | _        |j        rt#          |          | _        nt'          |          | _        |j        rt+          |          nd | _        |                                  d S )Nr   )r\   r]   rj   r   feature_extractorr   feature_projectionmask_time_probmask_feature_probr   r   r   r;  r   r  masked_spec_embeddo_stable_layer_normr  encoderrf  r  r  adapter	post_initr   s     r*   r]   zWavLMModel.__init__R  s       !4V!<!<"8"@"@  3&&&*BS*H*H%'\%,v?Q2R2R2[2[2]2]%^%^D"& 	06v>>DLL'//DL/5/AK|F+++t 	r,   c                 b    t          j        dt                     |                                  dS z
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. Please use the equivalent `freeze_feature_encoder` method instead.Nr   r   r   freeze_feature_encoderri   s    r*   freeze_feature_extractorz#WavLMModel.freeze_feature_extractorf  ;    
 	Q	
 	
 	

 	##%%%%%r,   c                 8    | j                                          dS 
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        N)r  r   r  s    r*   r  z!WavLMModel.freeze_feature_encoderr  s    
 	1133333r,   Nrp   mask_time_indicesr   c                    t          | j        dd          s|S |                                \  }}}|#| j                            |j                  ||<   n| j        j        dk    r| j        r|t          ||f| j        j        | j        j	        || j        j
                  }t          j        ||j        t          j                  }| j                            |j                  ||<   | j        j        dk    r| j        rt          ||f| j        j        | j        j        | j        j                  }t          j        ||j        t          j                  }|dddf                             d|d          }d||<   |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://arxiv.org/abs/1904.08779).
        apply_spec_augmentTNr   )r   r   r   r   )r&  r4   )r   r   r   r-   )getattrrj   r   r  r%  r4   r  r   rT   mask_time_lengthmask_time_min_masksr   tensorr&  r@   r  mask_feature_lengthmask_feature_min_masksexpand)ri   rp   r   r   rL   r)   r   mask_feature_indicess           r*   _mask_hidden_stateszWavLMModel._mask_hidden_statesy  s    t{$8$?? 	!   4A3E3E3G3G0
O[(/3/E/H/HI\/]/]M+,,['!+++ 5_-+4 K8-+9! ! ! !&->}G[chcm n n n/3/E/H/HI\/]/]M+,;(1,,,#8[)+7 K;+<	$ $ $  $)<0D]Mainis#t#t#t #74#@#G#GO]_#`#` 23M./r,   audio
checkpointoutput_typer  modalityexpected_outputr   r   r|  r}  r    c                 :   ||n| j         j        }||n| j         j        }||n| j         j        }|                     |          }|                    dd          }|#|                     |j        d         |d          }|                     |          \  }}| 	                    |||          }| 
                    |||||          }	|	d         }| j        |                     |          }|s||f|	dd          z   S t          |||	j        |	j                  S )	Nr   r   Fr  )r   r   r   r   r|  r}  r   )rv  extract_featuresrp   rw  )rj   r   r|  use_return_dictr  r   r  r   r  r  r  r  r   rp   rw  )
ri   r   r   r   r   r|  r}  r  rp   encoder_outputss
             r*   rq   zWavLMModel.forward  s|   " 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]11,??+55a;;%!DD &q)>u E  N +/*A*ABR*S*S''00->~ 1 
 
 ,,)/!5# ' 
 
 (*<# LL77M 	K!#34qrr7JJJ&+-)7&1	
 
 
 	
r,   )NNNNNNN)rt   ru   rv   r   r]   r  r  r   r<  r   r=  r  r   WAVLM_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr;  r@   r   r   rq   rw   rx   s   @r*   r  r  L  s       {      (
& 
& 
&4 4 4 :>59	, ,(, $E$56, !!12	, , , ,\ +*+ABB&+$.   269=,0/3&*2
 2
u|,2
 !.2
 $E$56	2

 $D>2
 'tn2
 d^2
 
u--	.2
 2
 2
  CB2
 2
 2
 2
 2
r,   r  zcWavLM Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).c                   >    e Zd Zddee         f fdZd Zd Zd Zd Z	 e
e           eeeeee          	 	 	 	 	 dd	eej                 d
eej                 dee         dee         dee         deej                 deeef         fd                        Z xZS )WavLMForCTCNtarget_langc                    t                                          |           t          |          | _        t	          j        |j                  | _        || _        |j	        t          d| j         d          t          |d          r|j        r|j        n|j        }t	          j        ||j	                  | _        |                                  d S )NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `WavLMForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.r  )r\   r]   r  r  r   r   final_dropoutr   r  
vocab_sizer6   rl   r   r  r  r   r   lm_headr  )ri   rj   r  r  rl   s       r*   r]   zWavLMForCTC.__init__  s       ''
z&"677&$H H H H   *1)G)GvFL^vF%%djdv 	 y!3V5FGG 	r,   c                    | j         }|)t          | j        dd          t          d| d          |2t          | j        dd          t                              d           dS ||                     |d           dS dS )a'  
        This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
        passing `target_lang=...` to `from_pretrained(...)`.

        This method is **not** supposed to be called by the user and is prone to be changed in the future.
        Nadapter_attn_dimzCannot pass `target_lang`: z- if `config.adapter_attn_dim` is not defined.z)By default `target_lang` is set to 'eng'.T)
force_load)r  r  rj   r6   loggerinfoload_adapter)ri   r  s     r*   tie_weightszWavLMForCTC.tie_weights  s     &"wt{<NPT'U'U']u;uuuvvv WT[:Ld%S%S%_KKCDDDDD$kd;;;;; %$r,   c                 b    t          j        dt                     |                                  dS r  r  Nr  r  s    r*   r  z$WavLMForCTC.freeze_feature_extractor  r  r,   c                 B    | j         j                                         dS r  r  r  r   r  s    r*   r  z"WavLMForCTC.freeze_feature_encoder"  !    
 	
$7799999r,   c                 L    | j                                         D ]	}d|_        
dS z
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        FNr  r   r   r   s     r*   freeze_base_modelzWavLMForCTC.freeze_base_model)  6    
 Z**,, 	( 	(E"'E	( 	(r,   )r  r  r  r  expected_lossr   r   r   r|  r}  labelsr    c           
      p   ||n| j         j        }|>|                                | j         j        k    rt	          d| j         j                   |                     |||||          }|d         }|                     |          }|                     |          }	d}
|Z||nt          j	        |t          j
                  }|                     |                    d                                        t          j
                  }|dk    }|                    d          }|                    |          }t          j                            |	dt          j                                      dd          }t          j        j                            d	
          5  t          j                            ||||| j         j        | j         j        | j         j                  }
ddd           n# 1 swxY w Y   |s|	f|t6          d         z   }|
|
f|z   n|S t9          |
|	|j        |j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        Nz$Label values must be <= vocab_size: r  r   r3   r-   )r   r4   r   F)enabled)blank	reductionzero_infinitylosslogitsrp   rw  )rj   r  r%   r!  r6   r  r   r"  r   	ones_liker#  r  r;   r%  masked_selectr   r  log_softmaxfloat32r   backendscudnnflagsctc_losspad_token_idctc_loss_reductionctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   rp   rw  )ri   r   r   r   r|  r}  r5  r^  rp   r=  r<  rM   labels_masktarget_lengthsflattened_targets	log_probsoutputs                    r*   rq   zWavLMForCTC.forward1  s   0 &1%<kk$+B]&**,,$+2H"H"H\DKDZ\\]]]**)/!5#  
 
  
]33m,, #1"<%/R^fkfpBqBqBq  !AA.BTBTUWBXBXYY\\]b]ghhM !A+K(__R00N & 4 4[ A A 11&b1VV``abdeffI%++E+:: 	 	}--%!"+2"k<"&+"? .  	 	 	 	 	 	 	 	 	 	 	 	 	 	 	  	FY)F)G)G!HHF)-)9TGf$$vEfG4IV]Vh
 
 
 	
s    AG11G58G5rn   r  )rt   ru   rv   r   rH  r]   r)  r  r  r2  r   r  r   r  r   r  _CTC_EXPECTED_OUTPUT_CTC_EXPECTED_LOSSr   r;  r@   r   r   rq   rw   rx   s   @r*   r  r    sz        HSM      .< < <*
& 
& 
&: : :( ( ( +*+ABB&"$,(   26,0/3&*)-D
 D
u|,D
 !.D
 $D>	D

 'tnD
 d^D
 &D
 
un$	%D
 D
 D
  CBD
 D
 D
 D
 D
r,   r  z
    WavLM Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    c                   "    e Zd Z fdZd Zd Zd Z ee           e	e
eed          	 	 	 	 	 ddeej                 d	eej                 d
ee         dee         dee         deej                 deeef         fd                        Z xZS )WavLMForSequenceClassificationc                    t                                          |           t          |d          r|j        rt	          d          t          |          | _        |j        dz   }|j        r.t          j
        t          j        |          |z            | _        t          j        |j        |j                  | _        t          j        |j        |j                  | _        |                                  d S )Nr  z\Sequence classification does not support the use of WavLM adapters (config.add_adapter=True)r   )r\   r]   r   r  r6   r  r  rl  use_weighted_layer_sumr   r   r   rE   layer_weightsr   r   classifier_proj_size	projector
num_labels
classifierr  ri   rj   
num_layersrl   s      r*   r]   z'WavLMForSequenceClassification.__init__  s       6=)) 	f.@ 	n    ''
-1
( 	S!#ej.D.Dz.Q!R!RD6#5v7RSS)F$?ARSS 	r,   c                 b    t          j        dt                     |                                  dS r  r  r  s    r*   r  z7WavLMForSequenceClassification.freeze_feature_extractor  r  r,   c                 B    | j         j                                         dS r  r-  r  s    r*   r  z5WavLMForSequenceClassification.freeze_feature_encoder  r.  r,   c                 L    | j                                         D ]	}d|_        
dS r0  r1  r   s     r*   r2  z0WavLMForSequenceClassification.freeze_base_model  r3  r,   r  )r  r  r  r  Nr   r   r   r|  r}  r5  r    c                    ||n| j         j        }| j         j        rdn|}|                     |||||          }| j         j        rx|t                   }t          j        |d          }t          j        	                    | j
        d          }	||	                    ddd          z                      d          }n|d         }|                     |          }||                    d          }
nh|                     |j        d         |          }d|| <   |                    d          |                    d                              dd          z  }
|                     |
          }d}|Kt%                      } ||                    d| j         j                  |                    d                    }|s|f|t          d         z   }||f|z   n|S t)          |||j        |j        	          S )
  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NTr  r   r   r-   r   r   r;  )rj   r  rT  r  rI  r   stackr   r  r  rU  r   r;   rW  r  r  r   rY  r   rX  r   rp   rw  )ri   r   r   r   r|  r}  r5  r^  rp   norm_weightspooled_outputpadding_maskr=  r<  loss_fctrN  s                   r*   rq   z&WavLMForSequenceClassification.forward  s&   0 &1%<kk$+B]'+{'IcttOc**)/!5#  
 
 ;- 	'#$ABM!K1===M=001C0LLL*\->->r1a-H-HHMMRSMTTMM#AJM}55!)..1.55MMBB=CVWXCY[ijjL+.M<-()--!-44|7G7GA7G7N7N7S7STVXY7Z7ZZM//'))H8FKKDK,BCCV[[QS__UUD 	FY)F)G)G!HHF)-)9TGf$$vE'!/)	
 
 
 	
r,   r  )rt   ru   rv   r]   r  r  r2  r   r  r   r  r   r  r   r   r;  r@   r   r   rq   rw   rx   s   @r*   rR  rR    sD           $
& 
& 
&: : :( ( ( +*+ABB&,$	   26,0/3&*)-;
 ;
u|,;
 !.;
 $D>	;

 'tn;
 d^;
 &;
 
u..	/;
 ;
 ;
  CB;
 ;
 ;
 ;
 ;
r,   rR  za
    WavLM Model with a frame classification head on top for tasks like Speaker Diarization.
    c                   $    e Zd Z fdZd Zd Zd Z ee           e	e
eede          	 	 	 	 	 ddeej                 d	eej                 d
eej                 dee         dee         dee         deeef         fd                        Z xZS ) WavLMForAudioFrameClassificationc                    t                                          |           t          |d          r|j        rt	          d          t          |          | _        |j        dz   }|j        r.t          j
        t          j        |          |z            | _        t          j        |j        |j                  | _        |j        | _        |                                  d S )Nr  z_Audio frame classification does not support the use of WavLM adapters (config.add_adapter=True)r   )r\   r]   r   r  r6   r  r  rl  rT  r   r   r   rE   rU  r   r   rX  rY  init_weightsrZ  s      r*   r]   z)WavLMForAudioFrameClassification.__init__  s       6=)) 	f.@ 	q    ''
-1
( 	S!#ej.D.Dz.Q!R!RD)F$68IJJ +r,   c                 b    t          j        dt                     |                                  dS r+  r  r  s    r*   r  z9WavLMForAudioFrameClassification.freeze_feature_extractor  r  r,   c                 B    | j         j                                         dS r  r-  r  s    r*   r  z7WavLMForAudioFrameClassification.freeze_feature_encoder!  r.  r,   c                 L    | j                                         D ]	}d|_        
dS r0  r1  r   s     r*   r2  z2WavLMForAudioFrameClassification.freeze_base_model(  r3  r,   r  r  Nr   r   r5  r   r|  r}  r    c           	         ||n| j         j        }| j         j        rdn|}|                     |||||          }| j         j        rx|t                   }t          j        |d          }t          j        	                    | j
        d          }	||	                    ddd          z                      d          }n|d         }|                     |          }
d}|`t                      } ||
                    d| j                  t          j        |                    d| j                  d                    }|s|
f|t          d         z   }|S t#          ||
|j        |j        	          S )
r`  NTr  r   r   r-   r   )axisr;  )rj   r  rT  r  rI  r   ra  r   r  r  rU  r   r;   rY  r   rX  r  r   rp   rw  )ri   r   r   r5  r   r|  r}  r^  rp   rb  r=  r<  re  rN  s                 r*   rq   z(WavLMForAudioFrameClassification.forward0  s   0 &1%<kk$+B]'+{'IcttOc**)/!5#  
 
 ;- 	'#$ABM!K1===M=001C0LLL*\->->r1a-H-HHMMRSMTTMM#AJM//'))H8FKKDO<<el6;;WY[_[jKkKkrs>t>t>tuuD 	Y)F)G)G!HHFM$!/)	
 
 
 	
r,   r  )rt   ru   rv   r]   r  r  r2  r   r  r   _FRAME_CLASS_CHECKPOINTr   r  _FRAME_EXPECTED_OUTPUTr   r   r;  r@   r   r   rq   rw   rx   s   @r*   rg  rg    sG            
& 
& 
&: : :( ( ( +*+ABB*)$.   26)-,0/3&*3
 3
u|,3
 !.3
 &	3

 $D>3
 'tn3
 d^3
 
u++	,3
 3
 3
  CB3
 3
 3
 3
 3
r,   rg  c                   &     e Zd Zd fd	Zd Z xZS )AMSoftmaxLoss      >@皙?c                    t          t          |                                            || _        || _        || _        t          j        t          j	        ||          d          | _
        t          j                    | _        d S )NT)r   )r\   rr  r]   scalemarginrX  r   r   r   randnr   r   r<  )ri   	input_dimrX  rv  rw  rl   s        r*   r]   zAMSoftmaxLoss.__init__p  sk    mT""++---
$l5;y*#E#EUYZZZ'))			r,   c                    |                                 }t          j                            | j        d          }t          j                            |d          }t          j        ||          }|| j        z
  }t          j                            || j	                  }| j
        t          j        |                                ||          z  }|                     ||          }|S )Nr   r   r   )flattenr   r  	normalizer   r   mmrw  one_hotrX  rv  r4  r@   r<  )	ri   rp   r5  r   	cos_thetapsionehotr=  r<  s	            r*   rq   zAMSoftmaxLoss.forwardx  s    !!((!(<<//1/EEH]F33	$+%&&vt??ek&++--iHHHyy((r,   )rs  rt  rs   rx   s   @r*   rr  rr  o  sL        * * * * * *      r,   rr  c                   D     e Zd Zd fd	Zdej        dej        fdZ xZS )	TDNNLayerr   c                    t                                                       |dk    r|j        |dz
           n|j        |         | _        |j        |         | _        |j        |         | _        |j        |         | _        t          j
        | j        | j        z  | j                  | _        t          j                    | _        d S )Nr   r   )r\   r]   tdnn_dimr_   r`   tdnn_kernelrY   tdnn_dilationdilationr   r   kernelReLUrg   rh   s      r*   r]   zTDNNLayer.__init__  s    <DqLL6?8a<88fo^fNg"OH5!-h7,X6i 043C CTEVWW'))r,   rp   r    c                    t                      r/ddlm} t          | j        |          rt          j        d           |                    dd          }| j        j        	                    | j
        | j        | j                                      dd          }t          j                            ||| j        j        | j                  }|                    dd          }|                     |          }|S )Nr   )	LoraLayerzDetected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. You should exclude TDNNLayer from LoRA's target modules.r   r   )r  )r   peft.tuners.lorar  rF  r  r   r   r   r   r   r`   rY   r_   r   r  conv1dr[   r  rg   )ri   rp   r  r   s       r*   rq   zTDNNLayer.forward  s     	222222$+y11 O   &//155#(():D<LdN^__iijkmnoo,,]FDKDT_c_l,mm%//15566r,   rr   )rt   ru   rv   r]   r   r;  rq   rw   rx   s   @r*   r  r    sc        $ $ $ $ $ $U\ el        r,   r  zi
    WavLM Model with an XVector feature extraction head on top for tasks like Speaker Verification.
    c                   J    e Zd Z fdZd Zd Zd Zdeej	        e
f         fdZ ee           eeeede          	 	 	 	 	 dd
eej                 deej                 dee         dee         dee         deej                 deeef         fd                        Z xZS )WavLMForXVectorc                    t                                                     t                    | _        j        dz   }j        r.t          j        t          j	        |          |z            | _
        t          j        j        j        d                   | _        fdt          t!          j                            D             }t          j        |          | _        t          j        j        d         dz  j                  | _        t          j        j        j                  | _        t-          j        j                  | _        |                                  d S )Nr   r   c                 0    g | ]}t          |          S r/   )r  r   s     r*   r2   z,WavLMForXVector.__init__.<locals>.<listcomp>  s#    QQQy++QQQr,   r-   r   )r\   r]   r  r  rl  rT  r   r   r   rE   rU  r   r   r  rW  r>   rC   r   tdnnxvector_output_dimr  rY  rr  rX  	objectiveri  )ri   rj   r[  tdnn_layersrl   s    `  r*   r]   zWavLMForXVector.__init__  s)      ''
-1
( 	S!#ej.D.Dz.Q!R!RD6#5vq7IJJQQQQU3v;O;O5P5PQQQM+..	!#6?2+>+BFD]!^!^)F$=v?XYY&v'@&BSTTr,   c                 b    t          j        dt                     |                                  dS r+  r  r  s    r*   r  z(WavLMForXVector.freeze_feature_extractor  r  r,   c                 B    | j         j                                         dS r  r-  r  s    r*   r  z&WavLMForXVector.freeze_feature_encoder  r.  r,   c                 L    | j                                         D ]	}d|_        
dS r0  r1  r   s     r*   r2  z!WavLMForXVector.freeze_base_model  r3  r,   rM   c                 D    d }| j         j        D ]} |||d          }|S )z?
        Computes the output length of the TDNN layers
        c                     | |z
  |z  dz   S )Nr   r/   r  s      r*   r  zBWavLMForXVector._get_tdnn_output_lengths.<locals>._conv_out_length  s     !;.69A==r,   r   )rj   r  )ri   rM   r  rY   s       r*   _get_tdnn_output_lengthsz(WavLMForXVector._get_tdnn_output_lengths  sE    
	> 	> 	>
  ;2 	L 	LK,,]KKKMMr,   r  r  Nr   r   r   r|  r}  r5  r    c                 >   ||n| j         j        }| j         j        rdn|}|                     |||||          }| j         j        rx|t                   }t          j        |d          }t          j        	                    | j
        d          }	||	                    ddd          z                      d          }n|d         }|                     |          }| j        D ]}
 |
|          }|-|                    d          }|                    d          }n|                     |                    d                    }|                     |          }g }g }t'          |          D ]k\  }}|                    ||d|f                             d                     |                    ||d|f                             d                     lt          j        |          }t          j        |          }t          j        ||gd          }|                     |          }|                     |          }d}||                     ||          }|s||f|t          d         z   }||f|z   n|S t3          ||||j        |j                  S )	r`  NTr  r   r   r-   r   )r<  r=  
embeddingsrp   rw  )rj   r  rT  r  rI  r   ra  r   r  r  rU  r   r;   rW  r  r  r  r  r  rx  rG   r  r  rY  r  r   rp   rw  )ri   r   r   r   r|  r}  r5  r^  rp   rb  
tdnn_layermean_featuresstd_featuresfeat_extract_output_lengthstdnn_output_lengthsr   lengthstatistic_poolingoutput_embeddingsr=  r<  rN  s                         r*   rq   zWavLMForXVector.forward  s   0 &1%<kk$+B]'+{'IcttOc**)/!5#  
 
 ;- 	'#$ABM!K1===M=001C0LLL*\->->r1a-H-HHMMRSMTTMM#AJM}55) 	6 	6J&J}55MM !)..1.55M(,,,33LL*.*O*OP^PbPbghPbPiPi*j*j'"&"?"?@["\"\ML&':;; J J	6$$]1gvg:%>%C%C%C%J%JKKK##M!WfW*$=$A$Aa$A$H$HIIII!K66M ;|44L!I}l&CLLL 223DEE!233>>&&11D 	F/07;X;Y;Y3ZZF)-)9TGf$$vE(!/)
 
 
 	
r,   r  )rt   ru   rv   r]   r  r  r2  r   r   r=  r$   r  r   r  r   _XVECTOR_CHECKPOINTr   r  _XVECTOR_EXPECTED_OUTPUTr   r;  r@   r   rq   rw   rx   s   @r*   r  r    s~           &
& 
& 
&: : :( ( (eE<Lc<Q6R     +*+ABB&!$0   26,0/3&*)-I
 I
u|,I
 !.I
 $D>	I

 'tnI
 d^I
 &I
 
um#	$I
 I
 I
  CBI
 I
 I
 I
 I
r,   r  r   )Vr   r1  r   typingr   r   r   numpyr7   r   torch.nn.functionalr   r  r  torch.utils.checkpointtorch.nnr   activationsr	   integrations.deepspeedr
   integrations.fsdpr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   r   r   r   r   r   r   configuration_wavlmr   
get_loggerrt   r&  rI  r  r  r  rO  rP  ro  rp  r  r  r$   r0  r=  ndarrayrT   ModulerV   rz   r   r   r   r   r   r   r   r@  rO  rb  rf  r  r  r  r  r  WAVLM_START_DOCSTRINGr  r  r  rR  rg  rr  r  r  r/   r,   r*   <module>r     s       ) ) ) ) ) ) ) ) ) )                         % % % % % % ! ! ! ! ! ! @ @ @ @ @ @ 7 7 7 7 7 7                . - - - - -              - , , , , , 
	H	%	% !"    J &  t   9 Q  5   26t tc?tt t U-.	t
 t Zt t t tp    	   ,    bi   8    bi   2* * * * *29 * * *\    	   ) ) ) ) )") ) ) )X
 
 
 
 
/ 
 
 
1 1 1 1 1RY 1 1 1c  c  c  c  c RY c  c  c N    ry   0& & & & &	 & & &R" " " " "ry " " "JO
 O
 O
 O
 O
29 O
 O
 O
dP
 P
 P
 P
 P
") P
 P
 P
fC' C' C' C' C' C' C' C'N    29   @    	   $U U U U U? U U Up (" J e 
P
 P
 P
 P
 P
% P
 P
 
P
f m 
T
 T
 T
 T
 T
& T
 T
 
T
n   s
 s
 s
 s
 s
%9 s
 s
 s
l  	 g
 g
 g
 g
 g
'; g
 g
 g
V    BI   0    	   >  	 O
 O
 O
 O
 O
* O
 O
 O
 O
 O
r,   