
"""PyTorch CLAP model."""

import collections
import math
from dataclasses import dataclass
from typing import Any, List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
from torch import nn

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPooling,
    BaseModelOutputWithPoolingAndCrossAttentions,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, meshgrid, prune_linear_layer
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
    torch_int,
)
from .configuration_clap import ClapAudioConfig, ClapConfig, ClapTextConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "laion/clap-htsat-fused"


def interpolate(hidden_states, ratio):
    """
    Interpolate data in time domain. This is used to compensate the resolution reduction in downsampling of a CNN.
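
    A minimal, illustrative shape check (values assumed): with `ratio=4`, every time frame is repeated four times.

    ```python
    >>> import torch

    >>> hidden_states = torch.rand(2, 256, 527)
    >>> interpolate(hidden_states, ratio=4).shape
    torch.Size([2, 1024, 527])
    ```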

    Args:
        hidden_states (`torch.FloatTensor` of shape `(batch_size, time_length, classes_num)`):
            Input hidden states
        ratio (`int`):
            The ratio of the length of the output to the length of the input.
    """
    batch_size, time_length, classes_num = hidden_states.shape
    upsampled = hidden_states[:, :, None, :].repeat(1, 1, ratio, 1)
    upsampled = upsampled.reshape(batch_size, time_length * ratio, classes_num)
    return upsampled


def window_partition(hidden_states, window_size):
    """
    Returns the resized hidden states. The output shape should be `(batch_size * num_windows, window_size, window_size,
    num_channels)`
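
    A shape-only sketch (illustrative values): with `window_size=4`, a `(1, 8, 8, 96)` input is split into
    `(8 // 4) * (8 // 4) = 4` windows.

    ```python
    >>> import torch

    >>> hidden_states = torch.rand(1, 8, 8, 96)
    >>> window_partition(hidden_states, window_size=4).shape
    torch.Size([4, 4, 4, 96])
    ```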

    Args:
        hidden_states (`torch.FloatTensor` of shape `(batch_size, height, width, num_channels)`):
            Input hidden states
        window_size (`int`):
            Window size
    """
    batch_size, height, width, num_channels = hidden_states.shape

    hidden_states = hidden_states.view(
        batch_size, height // window_size, window_size, width // window_size, window_size, num_channels
    )
    windows = hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels)
    return windows


def window_reverse(windows, window_size, height, width):
    """
    Merges windows to produce higher resolution features.
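
    This is the inverse of `window_partition`; a shape-only sketch (illustrative values):

    ```python
    >>> import torch

    >>> windows = torch.rand(4, 4, 4, 96)
    >>> window_reverse(windows, window_size=4, height=8, width=8).shape
    torch.Size([1, 8, 8, 96])
    ```
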
    Args:
        windows (`torch.FloatTensor` of shape `(num_windows * batch_size, window_size, window_size, num_channels)`):
            Input windows
        window_size (`int`):
            Window size
        height (`int`):
            Height of the resized audio
        width (`int`):
            Width of the resized audio
    """
    num_channels = windows.shape[-1]
    windows = windows.view(-1, height // window_size, width // window_size, window_size, window_size, num_channels)
    windows = windows.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, height, width, num_channels)
    return windows


def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.
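
    A toy example (illustrative values, `padding_idx=1`): padded positions keep the padding index, while non-padded
    positions count up from `padding_idx + 1`.

    ```python
    >>> import torch

    >>> input_ids = torch.tensor([[0, 17, 25, 1, 1]])
    >>> create_position_ids_from_input_ids(input_ids, padding_idx=1)
    tensor([[2, 3, 4, 1, 1]])
    ```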

    Args:
        input_ids (`torch.Tensor`):
            The input token ids.
        padding_idx (`int`):
            The id of the padding token.
        past_key_values_length (`int`, *optional*, defaults to 0):
            The length of any cached past key values, used to offset the generated positions.

    Returns: torch.Tensor
    """
    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx


def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    labels = torch.arange(len(logits), device=logits.device)
    return nn.functional.cross_entropy(logits, labels)


@dataclass
class ClapTextModelOutput(ModelOutput):
    """
    Base class for text model's outputs that also contains a pooling of the last hidden states.
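
    Example (a hedged sketch: [`ClapTextModelWithProjection`] returns this output type; the checkpoint name is
    illustrative):

    ```python
    >>> from transformers import AutoTokenizer, ClapTextModelWithProjection

    >>> model = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused")
    >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

    >>> inputs = tokenizer(["a sound of a cat"], padding=True, return_tensors="pt")
    >>> outputs = model(**inputs)
    >>> text_embeds = outputs.text_embeds  # shape: (batch_size, projection_dim)
    ```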

    Args:
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
            The text embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    text_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


@dataclass
class ClapAudioModelOutput(ModelOutput):
    """
    ClapAudio model output to mimic the output of the original implementation.
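
    Example (a hedged sketch: [`ClapAudioModelWithProjection`] returns this output type; the random waveform and
    checkpoint name are illustrative):

    ```python
    >>> import torch
    >>> from transformers import AutoFeatureExtractor, ClapAudioModelWithProjection

    >>> model = ClapAudioModelWithProjection.from_pretrained("laion/clap-htsat-fused")
    >>> feature_extractor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-fused")

    >>> audio = torch.rand(48_000).numpy()  # one second of fake audio at 48 kHz
    >>> inputs = feature_extractor(audio, sampling_rate=48000, return_tensors="pt")
    >>> outputs = model(**inputs)
    >>> audio_embeds = outputs.audio_embeds  # shape: (batch_size, projection_dim)
    ```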

    Args:
        audio_embeds (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            The audio embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    """

    audio_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: torch.FloatTensor = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None


@dataclass
class ClapOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for audio-text similarity.
        logits_per_audio (`torch.FloatTensor` of shape `(audio_batch_size, text_batch_size)`):
            The scaled dot product scores between `audio_embeds` and `text_embeds`. This represents the audio-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, audio_batch_size)`):
            The scaled dot product scores between `text_embeds` and `audio_embeds`. This represents the text-audio
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`ClapTextModel`].
        audio_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The audio embeddings obtained by applying the projection layer to the pooled output of [`ClapAudioModel`].
        text_model_output (`BaseModelOutputWithPooling`):
            The output of the [`ClapTextModel`].
        audio_model_output (`BaseModelOutputWithPooling`):
            The output of the [`ClapAudioModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_audio: torch.FloatTensor = None
    logits_per_text: torch.FloatTensor = None
    text_embeds: torch.FloatTensor = None
    audio_embeds: torch.FloatTensor = None
    text_model_output: BaseModelOutputWithPooling = None
    audio_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "audio_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class ClapDropPath(nn.Module):
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). This is a slightly
    refactored version of the `SwinDropPath` implementation.
    """

    def __init__(self, drop_prob=None):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states):
        if self.drop_prob == 0.0 or not self.training:
            return hidden_states

        keep_prob = 1 - self.drop_prob
        # work with tensors of arbitrary rank, not just 2D ConvNets
        shape = (hidden_states.shape[0],) + (1,) * (hidden_states.ndim - 1)

        random_tensor = keep_prob + torch.rand(shape, dtype=hidden_states.dtype, device=hidden_states.device)
        random_tensor.floor_()  # binarize
        output = hidden_states.div(keep_prob) * random_tensor
        return output


class ClapAudioAFFBlock(nn.Module):
    r"""
    ATTENTIONAL FEATURE FUSION Block from CLAP, since in CLAP we are always in 2D mode, it is not needed to implement
    the 1D version.
    """

    def __init__(self, config: ClapAudioConfig):
        super().__init__()
        channels = config.patch_embeds_hidden_size
        downsize_ratio = config.aff_block_r
        inter_channels = int(channels // downsize_ratio)

        self.local_att = nn.Sequential(
            nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(inter_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(channels),
        )
        self.global_att = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(inter_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(channels),
        )

        self.sigmoid = nn.Sigmoid()

    def forward(self, hidden_states, residual):
        attention_input = hidden_states + residual

        fused_layer_output = self.local_att(attention_input) + self.global_att(attention_input)
        fused_layer_output = self.sigmoid(fused_layer_output)

        output = 2 * hidden_states * fused_layer_output + 2 * residual * (1 - fused_layer_output)
        return output


class ClapAudioPatchEmbed(nn.Module):
    """
    This module converts the hidden states reshaped as an image to patch embeddings ready to be passed to the
    Transformer block.
    """

    def __init__(self, config: ClapAudioConfig):
        super().__init__()
        img_size = (config.spec_size, config.spec_size) if isinstance(config.spec_size, int) else config.spec_size
        patch_size = (
            (config.patch_size, config.patch_size) if isinstance(config.patch_size, int) else config.patch_size
        )
        patch_stride = (
            (config.patch_stride, config.patch_stride) if isinstance(config.patch_stride, int) else config.patch_stride
        )

        self.img_size = img_size
        self.patch_stride = patch_stride

        self.grid_size = (img_size[0] // patch_stride[0], img_size[1] // patch_stride[1])
        self.num_patches = self.grid_size[0] * self.grid_size[1]

        self.flatten = config.flatten_patch_embeds
        self.enable_fusion = config.enable_fusion

        padding = ((patch_size[0] - patch_stride[0]) // 2, (patch_size[1] - patch_stride[1]) // 2)

        scale_factor = 4 if (self.enable_fusion) and (config.fusion_type == "channel_map") else 1

        self.proj = nn.Conv2d(
            config.patch_embed_input_channels * scale_factor,
            config.patch_embeds_hidden_size,
            kernel_size=patch_size,
            stride=patch_stride,
            padding=padding,
        )

        self.norm = nn.LayerNorm(config.patch_embeds_hidden_size) if config.enable_patch_layer_norm else nn.Identity()
        if self.enable_fusion:
            self.fusion_model = ClapAudioAFFBlock(config)
            self.mel_conv2d = nn.Conv2d(
                config.patch_embed_input_channels,
                config.patch_embeds_hidden_size,
                kernel_size=(patch_size[0], patch_size[1] * 3),
                stride=(patch_stride[0], patch_stride[1] * 3),
                padding=padding,
            )

    def forward(self, hidden_states, is_longer_idx=None):
        if self.enable_fusion:
            # retrieve the last mel as we have transposed the input
            global_hidden_states = hidden_states[:, 0:1, :, :]

            # global processing
            batch_size, num_channels, height, width = global_hidden_states.shape

            if height != self.img_size[0] or width != self.img_size[1]:
                raise ValueError(
                    f"Input audio size ({height}*{width}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
                )

            global_hidden_states = self.proj(global_hidden_states)
            output_width = global_hidden_states.size(-1)

            if len(is_longer_idx) > 0:
                # local processing
                local_hidden_states = hidden_states[is_longer_idx, 1:, :, :].contiguous()
                batch_size, num_channels, height, width = local_hidden_states.shape
                local_hidden_states = local_hidden_states.view(batch_size * num_channels, 1, height, width)

                local_hidden_states = self.mel_conv2d(local_hidden_states)

                _, features, height, width = local_hidden_states.shape
                local_hidden_states = local_hidden_states.view(batch_size, num_channels, features, height, width)
                local_hidden_states = local_hidden_states.permute((0, 2, 3, 1, 4)).contiguous().flatten(3)

                local_width = local_hidden_states.size(-1)
                local_hidden_states = torch.nn.functional.pad(
                    local_hidden_states, (0, output_width - local_width), "constant", 0
                )

                global_hidden_states[is_longer_idx] = self.fusion_model(
                    global_hidden_states[is_longer_idx], local_hidden_states
                )
            hidden_states = global_hidden_states
        else:
            _, _, height, width = hidden_states.shape
            if height != self.img_size[0] or width != self.img_size[1]:
                raise ValueError(
                    f"Input audio size ({height}*{width}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
                )
            hidden_states = self.proj(hidden_states)

        if self.flatten:
            hidden_states = hidden_states.flatten(2).transpose(1, 2)
        hidden_states = self.norm(hidden_states)
        return hidden_states


class ClapAudioSelfAttention(nn.Module):
    def __init__(self, config, dim, num_heads, window_size):
        super().__init__()
        if dim % num_heads != 0:
            raise ValueError(
                f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})"
            )

        self.num_attention_heads = num_heads
        self.attention_head_size = int(dim / num_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.window_size = (
            window_size if isinstance(window_size, collections.abc.Iterable) else (window_size, window_size)
        )

        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads)
        )

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))
        coords_flatten = torch.flatten(coords, 1)
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
        relative_coords[:, :, 0] += self.window_size[0] - 1
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)
        self.register_buffer("relative_position_index", relative_position_index)

        self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        batch_size, dim, num_channels = hidden_states.shape
        mixed_query_layer = self.query(hidden_states)

        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)]
        relative_position_bias = relative_position_bias.view(
            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
        )

        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
        attention_scores = attention_scores + relative_position_bias.unsqueeze(0)

        if attention_mask is not None:
            # Apply the shifted-window attention mask, shared across the batch
            mask_shape = attention_mask.shape[0]
            attention_scores = attention_scores.view(
                batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim
            )
            attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(0)
            attention_scores = attention_scores.view(-1, self.num_attention_heads, dim, dim)

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class ClapAudioSelfOutput(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        self.dense = nn.Linear(dim, dim)
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class ClapAudioAttention(nn.Module):
    def __init__(self, config, dim, num_heads, window_size):
        super().__init__()
        self.self = ClapAudioSelfAttention(config, dim, num_heads, window_size)
        self.output = ClapAudioSelfOutput(config, dim)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions)
        attention_output = self.output(self_outputs[0], hidden_states)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class ClapAudioIntermediate(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        self.dense = nn.Linear(dim, int(config.mlp_ratio * dim))
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class ClapAudioOutput(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class ClapAudioLayer(nn.Module):
    def __init__(self, config, dim, input_resolution, num_heads, shift_size=0):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.shift_size = shift_size
        self.window_size = config.window_size
        self.input_resolution = input_resolution
        self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.attention = ClapAudioAttention(config, dim, num_heads, window_size=self.window_size)
        self.drop_path = ClapDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
        self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.intermediate = ClapAudioIntermediate(config, dim)
        self.output = ClapAudioOutput(config, dim)

    def set_shift_and_window_size(self, input_resolution):
        if min(input_resolution) <= self.window_size:
            # if the window size is larger than the input resolution, we don't partition windows
            self.shift_size = torch_int(0)
            self.window_size = (
                torch.min(torch.tensor(input_resolution)) if torch.jit.is_tracing() else min(input_resolution)
            )

    def get_attn_mask(self, height, width, dtype, device):
        if self.shift_size > 0:
            # calculate the attention mask for shifted-window multi-head self-attention
            img_mask = torch.zeros((1, height, width, 1), dtype=dtype, device=device)
            height_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            width_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            count = 0
            for height_slice in height_slices:
                for width_slice in width_slices:
                    img_mask[:, height_slice, width_slice, :] = count
                    count += 1

            mask_windows = window_partition(img_mask, self.window_size)
            mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
            attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
        else:
            attn_mask = None
        return attn_mask

    def maybe_pad(self, hidden_states, height, width):
        pad_right = (self.window_size - width % self.window_size) % self.window_size
        pad_bottom = (self.window_size - height % self.window_size) % self.window_size
        pad_values = (0, 0, 0, pad_right, 0, pad_bottom)
        hidden_states = nn.functional.pad(hidden_states, pad_values)
        return hidden_states, pad_values

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        always_partition: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        if not always_partition:
            self.set_shift_and_window_size(input_dimensions)

        height, width = input_dimensions
        batch_size, _, channels = hidden_states.size()
        shortcut = hidden_states

        hidden_states = self.layernorm_before(hidden_states)

        hidden_states = hidden_states.view(batch_size, height, width, channels)

        # pad hidden_states to multiples of window size
        hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)

        _, height_pad, width_pad, _ = hidden_states.shape
        # cyclic shift
        if self.shift_size > 0:
            shifted_hidden_states = torch.roll(hidden_states, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
        else:
            shifted_hidden_states = hidden_states

        # partition windows
        hidden_states_windows = window_partition(shifted_hidden_states, self.window_size)
        hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels)
        attn_mask = self.get_attn_mask(
            height_pad, width_pad, dtype=hidden_states.dtype, device=hidden_states_windows.device
        )

        attention_outputs = self.attention(
            hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions
        )

        attention_output = attention_outputs[0]

        attention_windows = attention_output.view(-1, self.window_size, self.window_size, channels)
        shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad)

        # reverse cyclic shift
        if self.shift_size > 0:
            attention_windows = torch.roll(shifted_windows, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
        else:
            attention_windows = shifted_windows

        was_padded = pad_values[3] > 0 or pad_values[5] > 0
        if was_padded:
            attention_windows = attention_windows[:, :height, :width, :].contiguous()

        attention_windows = attention_windows.view(batch_size, height * width, channels)

        hidden_states = shortcut + self.drop_path(attention_windows)

        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)
        layer_output = hidden_states + self.output(layer_output)

        layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
        return layer_outputs


class ClapAudioStage(nn.Module):
    def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, downsample):
        super().__init__()
        self.config = config
        self.dim = dim
        self.blocks = nn.ModuleList(
            [
                ClapAudioLayer(
                    config=config,
                    dim=dim,
                    input_resolution=input_resolution,
                    num_heads=num_heads,
                    shift_size=0 if (i % 2 == 0) else config.window_size // 2,
                )
                for i in range(depth)
            ]
        )

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(input_resolution, dim=dim, norm_layer=nn.LayerNorm)
        else:
            self.downsample = None

        self.pointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        always_partition: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        height, width = input_dimensions
        for i, layer_module in enumerate(self.blocks):
            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_outputs = layer_module(
                hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition
            )

            hidden_states = layer_outputs[0]

        hidden_states_before_downsampling = hidden_states
        if self.downsample is not None:
            height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2
            output_dimensions = (height, width, height_downsampled, width_downsampled)
            hidden_states = self.downsample(hidden_states_before_downsampling, input_dimensions)
        else:
            output_dimensions = (height, width, height, width)

        stage_outputs = (hidden_states, hidden_states_before_downsampling, output_dimensions)

        if output_attentions:
            stage_outputs += layer_outputs[1:]
        return stage_outputs


class ClapAudioPatchMerging(nn.Module):
    """
    Patch Merging Layer.
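
    Each merge groups a 2x2 neighborhood of patches into one token: the four interleaved `(height/2, width/2)`
    sub-grids are concatenated to `4 * dim` features, layer-normalized, then linearly reduced to `2 * dim`. A
    shape-only sketch (illustrative values):

    ```python
    >>> import torch

    >>> merging = ClapAudioPatchMerging(input_resolution=(8, 8), dim=96)
    >>> merging(torch.rand(1, 64, 96), input_dimensions=(8, 8)).shape
    torch.Size([1, 16, 192])
    ```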

    Args:
        input_resolution (`Tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    """

    def __init__(self, input_resolution: Tuple[int], dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
        super().__init__()
        self.input_resolution = input_resolution
        self.dim = dim
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
        self.norm = norm_layer(4 * dim)

    def maybe_pad(self, input_feature, height, width):
        should_pad = (height % 2 == 1) or (width % 2 == 1)
        if should_pad:
            pad_values = (0, 0, 0, width % 2, 0, height % 2)
            input_feature = nn.functional.pad(input_feature, pad_values)

        return input_feature

    def forward(self, input_feature: torch.Tensor, input_dimensions: Tuple[int, int]) -> torch.Tensor:
        height, width = input_dimensions
        # `dim` is height * width
        batch_size, dim, num_channels = input_feature.shape

        input_feature = input_feature.view(batch_size, height, width, num_channels)
        # pad input to be divisible by width and height, if needed
        input_feature = self.maybe_pad(input_feature, height, width)
        # each of the four slices is [batch_size, height/2, width/2, num_channels]
        input_feature_0 = input_feature[:, 0::2, 0::2, :]
        input_feature_1 = input_feature[:, 1::2, 0::2, :]
        input_feature_2 = input_feature[:, 0::2, 1::2, :]
        input_feature_3 = input_feature[:, 1::2, 1::2, :]
        # [batch_size, height/2 * width/2, 4 * num_channels]
        input_feature = torch.cat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1)
        input_feature = input_feature.view(batch_size, -1, 4 * num_channels)

        input_feature = self.norm(input_feature)
        input_feature = self.reduction(input_feature)

        return input_feature


class ClapAudioEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.num_layers = len(config.depths)

        self.config = config
        self.patch_embed = ClapAudioPatchEmbed(config)
        self.enable_fusion = config.enable_fusion
        self.patch_stride = self.patch_embed.patch_stride
        self.spec_size = config.spec_size
        self.freq_ratio = config.spec_size // config.num_mel_bins

        self.num_features = int(config.patch_embeds_hidden_size * 2 ** (self.num_layers - 1))

        drop_path_rate = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]

        grid_size = self.patch_embed.grid_size
        self.input_resolutions = [(grid_size[0] // (2**i), grid_size[1] // (2**i)) for i in range(self.num_layers)]

        self.layers = nn.ModuleList(
            [
                ClapAudioStage(
                    config=config,
                    dim=int(config.patch_embeds_hidden_size * 2**i_layer),
                    input_resolution=self.input_resolutions[i_layer],
                    depth=config.depths[i_layer],
                    num_heads=config.num_attention_heads[i_layer],
                    drop_path=drop_path_rate[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
                    downsample=ClapAudioPatchMerging if (i_layer < self.num_layers - 1) else None,
                )
                for i_layer in range(self.num_layers)
            ]
        )

        self.gradient_checkpointing = False

        self.batch_norm = nn.BatchNorm2d(config.num_mel_bins)
        self.norm = nn.LayerNorm(self.num_features)
        self.depths = config.depths
        self.avgpool = nn.AdaptiveAvgPool1d(1)

    def reshape_mel2img(self, normalized_input_features):
        """
        The input is 4 normalized log mel spectrograms. It is reshaped to the common shape of images. Each channel
        should represent 1 of the 4 crops of the spectrogram. For more details, refer to the [`ClapFeatureExtractor`].
        """
        _, _, time_length, freq_length = normalized_input_features.shape

        spec_width = int(self.spec_size * self.freq_ratio)
        spec_heigth = self.spec_size // self.freq_ratio

        if time_length > spec_width or freq_length > spec_heigth:
            raise ValueError("the wav size should be less than or equal to the swin input size")

        # to avoid bicubic zero error
        if time_length < spec_width:
            normalized_input_features = nn.functional.interpolate(
                normalized_input_features, (spec_width, freq_length), mode="bicubic", align_corners=True
            )
        if freq_length < spec_heigth:
            normalized_input_features = nn.functional.interpolate(
                normalized_input_features, (time_length, spec_heigth), mode="bicubic", align_corners=True
            )

        batch, channels, time, freq = normalized_input_features.shape

        # batch_size, channels, spec_width, spec_heigth -> batch_size, channels * freq_ratio, time // freq_ratio, freq
        normalized_input_features = normalized_input_features.reshape(
            batch, channels * self.freq_ratio, time // self.freq_ratio, freq
        )
        normalized_input_features = normalized_input_features.permute(0, 1, 3, 2).contiguous()
        normalized_input_features = normalized_input_features.reshape(
            batch, channels, freq * self.freq_ratio, time // self.freq_ratio
        )

        return normalized_input_features

    def forward(
        self,
        input_features,
        is_longer: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        output_hidden_states_before_downsampling: Optional[bool] = False,
        always_partition: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, ClapAudioModelOutput]:
        input_features = input_features.transpose(1, 3)
        normalized_input_features = self.batch_norm(input_features)
        normalized_input_features = normalized_input_features.transpose(1, 3)

        is_longer_list_idx = None
        if self.enable_fusion:
            is_longer_list = is_longer.to(input_features.device)
            is_longer_list_idx = torch.where(is_longer_list == 1)[0]

        hidden_states = self.reshape_mel2img(normalized_input_features)

        frames_num = hidden_states.shape[2]

        hidden_states = self.patch_embed(hidden_states, is_longer_list_idx)

        all_hidden_states = () if output_hidden_states else None
        all_reshaped_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        input_dimensions = self.input_resolutions[0]

        if output_hidden_states:
            batch_size, _, hidden_size = hidden_states.shape
            # rearrange batch_size (height width) channels -> batch_size channel height width
            reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
            reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
            all_hidden_states += (hidden_states,)
            all_reshaped_hidden_states += (reshaped_hidden_state,)

        for i, layer_module in enumerate(self.layers):
            layer_head_mask = head_mask[i] if head_mask is not None else None

            input_dimensions = self.input_resolutions[i]

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__, hidden_states, input_dimensions, layer_head_mask, output_attentions
                )
            else:
                layer_outputs = layer_module(
                    hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition
                )

            hidden_states = layer_outputs[0]

            hidden_states_before_downsampling = layer_outputs[1]
            output_dimensions = layer_outputs[2]

            input_dimensions = (output_dimensions[-2], output_dimensions[-1])

            if output_hidden_states and output_hidden_states_before_downsampling:
                batch_size, _, hidden_size = hidden_states_before_downsampling.shape
                # rearrange batch_size (height width) channels -> batch_size channel height width
                # here we use the original (not downsampled) height and width
                reshaped_hidden_state = hidden_states_before_downsampling.view(
                    batch_size, *(output_dimensions[0], output_dimensions[1]), hidden_size
                )
                reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
                all_hidden_states += (hidden_states_before_downsampling,)
                all_reshaped_hidden_states += (reshaped_hidden_state,)
            elif output_hidden_states and not output_hidden_states_before_downsampling:
                batch_size, _, hidden_size = hidden_states.shape
                # rearrange batch_size (height width) channels -> batch_size channel height width
                reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
                reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
                all_hidden_states += (hidden_states,)
                all_reshaped_hidden_states += (reshaped_hidden_state,)

            if output_attentions:
                all_self_attentions += layer_outputs[3:]

        last_hidden_state = self.norm(hidden_states)

        batch_size, _, n_channels = last_hidden_state.shape

        freq_shape = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[0]
        temporal_shape = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[1]

        last_hidden_state = (
            last_hidden_state.permute(0, 2, 1).contiguous().reshape(batch_size, n_channels, freq_shape, temporal_shape)
        )

        batch_size, n_channels, n_frequencies, n_temp = last_hidden_state.shape
        # group 2D CNN
        c_freq_bin = n_frequencies // self.freq_ratio
        last_hidden_state = last_hidden_state.reshape(
            batch_size, n_channels, n_frequencies // c_freq_bin, c_freq_bin, n_temp
        )
        last_hidden_state = (
            last_hidden_state.permute(0, 1, 3, 2, 4).contiguous().reshape(batch_size, n_channels, c_freq_bin, -1)
        )
        latent_output = self.avgpool(torch.flatten(last_hidden_state, 2))
        latent_output = torch.flatten(latent_output, 1)

        if not return_dict:
            return tuple(
                v
                for v in [
                    last_hidden_state,
                    latent_output,
                    all_reshaped_hidden_states,
                    all_self_attentions,
                ]
                if v is not None
            )

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=latent_output,
            hidden_states=all_reshaped_hidden_states,
            attentions=all_self_attentions,
        )


CLAP_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`ClapConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

CLAP_TEXT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

CLAP_AUDIO_INPUTS_DOCSTRING = r"""
    Args:
        input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Input audio features. This should be returned by the [`ClapFeatureExtractor`] class that you can also
            retrieve from [`AutoFeatureExtractor`]. See [`ClapFeatureExtractor.__call__`] for details.
        is_longer (`torch.FloatTensor` of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

CLAP_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Input audio features. This should be returned by the [`ClapFeatureExtractor`] class that you can also
            retrieve from [`AutoFeatureExtractor`]. See [`ClapFeatureExtractor.__call__`] for details.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


class ClapProjectionLayer(nn.Module):
    def __init__(self, config: Union[ClapAudioConfig, ClapTextConfig]):
        super().__init__()
        self.config = config
        hidden_size = config.hidden_size
        projection_dim = config.projection_dim

        self.linear1 = nn.Linear(hidden_size, projection_dim)
        self.activation = ACT2FN[config.projection_hidden_act]
        self.linear2 = nn.Linear(projection_dim, projection_dim)

    def forward(self, hidden_states):
        hidden_states = self.linear1(hidden_states)
        hidden_states = self.activation(hidden_states)
        hidden_states = self.linear2(hidden_states)
        return hidden_states


class ClapTextEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

        self.padding_idx = config.pad_token_id
        # re-create the position embeddings with the padding index now that it is known
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        # Setting the token_type_ids to the registered buffer in the constructor where it is all zeros, which usually
        # occurs when it is auto-generated; the registered buffer helps users trace the model without passing
        # token_type_ids.
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)


# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->ClapText
class ClapTextSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        self.is_decoder = config.is_decoder

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        mixed_query_layer = self.query(hidden_states)

        # If this is instantiated as a cross-attention module, the keys and values come from an
        # encoder; the attention mask needs to be such that the encoder's padding tokens are not
        # attended to.
        is_cross_attention = encoder_hidden_states is not None

        if is_cross_attention and past_key_value is not None:
            # reuse k, v from the cross-attention cache
            key_layer = past_key_value[0]
            value_layer = past_key_value[1]
            attention_mask = encoder_attention_mask
        elif is_cross_attention:
            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
            attention_mask = encoder_attention_mask
        elif past_key_value is not None:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))
            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))

        query_layer = self.transpose_for_scores(mixed_query_layer)

        use_cache = past_key_value is not None
        if self.is_decoder:
            # save the cross-attention or uni-directional self-attention key/value states
            past_key_value = (key_layer, value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
            if use_cache:
                position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
                    -1, 1
                )
            else:
                position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r

            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the ClapTextModel forward)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        if self.is_decoder:
            outputs = outputs + (past_key_value,)
        return outputs


# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->ClapText
class ClapTextSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


CLAP_TEXT_SELF_ATTENTION_CLASSES = {
    "eager": ClapTextSelfAttention,
}


# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->ClapText
class ClapTextAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        self.self = CLAP_TEXT_SELF_ATTENTION_CLASSES[config._attn_implementation](
            config, position_embedding_type=position_embedding_type
        )
        self.output = ClapTextSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        self_outputs = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        attention_output = self.output(self_outputs[0], hidden_states)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->ClapText
class ClapTextIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->ClapText
class ClapTextOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->ClapText
class ClapTextLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = ClapTextAttention(config)
        self.is_decoder = config.is_decoder
        self.add_cross_attention = config.add_cross_attention
        if self.add_cross_attention:
            if not self.is_decoder:
                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
            self.crossattention = ClapTextAttention(config, position_embedding_type="absolute")
        self.intermediate = ClapTextIntermediate(config)
        self.output = ClapTextOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # decoder uni-directional self-attention cached key/values tuple is at positions 1, 2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_attn_past_key_value,
        )
        attention_output = self_attention_outputs[0]

        # if decoder, the last output is a tuple with the self-attn cache
        if self.is_decoder:
            outputs = self_attention_outputs[1:-1]
            present_key_value = self_attention_outputs[-1]
        else:
            outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
            present_key_value = None

        cross_attn_present_key_value = None
        if self.is_decoder and encoder_hidden_states is not None:
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention"
                    " layers by setting `config.add_cross_attention=True`"
                )

            # cross-attn cached key/values tuple is at positions 3, 4 of past_key_value tuple
            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
            cross_attention_outputs = self.crossattention(
                attention_output,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                cross_attn_past_key_value,
                output_attentions,
            )
            attention_output = cross_attention_outputs[0]
            outputs = outputs + cross_attention_outputs[1:-1]  # add cross attentions if we output attention weights

            # add cross-attn cache to positions 3, 4 of present_key_value tuple
            cross_attn_present_key_value = cross_attention_outputs[-1]
            present_key_value = present_key_value + cross_attn_present_key_value

        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        outputs = (layer_output,) + outputs

        # if decoder, return the attn key/values as the last output
        if self.is_decoder:
            outputs = outputs + (present_key_value,)

        return outputs

    def feed_forward_chunk(self, attention_output):
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(attention_output, intermediate_output)
        return layer_output


# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->ClapText
class ClapTextEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([ClapTextLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        next_decoder_cache = () if use_cache else None
        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )

            hidden_states = layer_outputs[0]
            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->ClapText
class ClapTextPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


class ClapPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = ClapConfig
    base_model_prefix = "clap"
    supports_gradient_checkpointing = False

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor

        if isinstance(module, ClapTextEmbeddings):
            module.position_embeddings.weight.data.normal_(mean=0.0, std=factor * 0.02)
            module.token_type_embeddings.weight.data.normal_(mean=0.0, std=factor * 0.02)
        elif isinstance(module, ClapModel):
            nn.init.normal_(module.logit_scale_a, std=factor * 0.02)
            nn.init.normal_(module.logit_scale_t, std=factor * 0.02)
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=factor * 0.02)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, (nn.Conv2d, nn.Linear)):
            in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor
            nn.init.normal_(module.weight, std=in_proj_std)
            if module.bias is not None:
                module.bias.data.zero_()


class ClapAudioModel(ClapPreTrainedModel):
    config_class = ClapAudioConfig
    main_input_name = "input_features"

    def __init__(self, config: ClapAudioConfig):
        super().__init__(config)
        self.audio_encoder = ClapAudioEncoder(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.audio_encoder.patch_embed.proj

    @add_start_docstrings_to_model_forward(CLAP_AUDIO_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=ClapAudioConfig)
    def forward(
        self,
        input_features: Optional[torch.FloatTensor] = None,
        is_longer: Optional[torch.BoolTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import AutoProcessor, ClapAudioModel

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> model = ClapAudioModel.from_pretrained("laion/clap-htsat-fused")
        >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-fused")

        >>> inputs = processor(audios=audio_sample, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
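        >>> # hedged extra step (output not checked here): inspect the encoder output
        >>> last_hidden_state.shape  # doctest: +SKIP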
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        return self.audio_encoder(
            input_features=input_features,
            is_longer=is_longer,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


class ClapTextModel(ClapPreTrainedModel):
    """

    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762

    """

    config_class = ClapTextConfig

    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config

        self.embeddings = ClapTextEmbeddings(config)
        self.encoder = ClapTextEncoder(config)

        self.pooler = ClapTextPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        r"""
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
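
        Example (a hedged sketch, not a tested doctest; `ClapTextModel` is usually driven through
        [`ClapModel`], so loading it standalone from the joint checkpoint is an assumption here):

        ```python
        >>> from transformers.models.clap.modeling_clap import ClapTextModel
        >>> from transformers import AutoTokenizer

        >>> model = ClapTextModel.from_pretrained("laion/clap-htsat-unfused")  # doctest: +SKIP
        >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")
        >>> inputs = tokenizer(["a sound of a cat"], return_tensors="pt")
        >>> pooled = model(**inputs).pooler_output  # doctest: +SKIP
        ```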
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.config.is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        # past_key_values_length
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device)

        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # We can provide a self-attention mask of dimensions
        # [batch_size, from_seq_length, to_seq_length] ourselves in which case we just need to make
        # it broadcastable to all heads.
        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)

        # If a 2D or 3D attention mask is provided for the cross-attention, we need to make it
        # broadcastable to [batch_size, num_heads, seq_length, seq_length].
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed: 1.0 in head_mask indicates we keep the head.
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )


@add_start_docstrings(CLAP_START_DOCSTRING)
class ClapModel(ClapPreTrainedModel):
    config_class = ClapConfig

    def __init__(self, config: ClapConfig):
        super().__init__(config)

        if not isinstance(config.text_config, ClapTextConfig):
            raise TypeError(
                "config.text_config is expected to be of type ClapTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.audio_config, ClapAudioConfig):
            raise TypeError(
                "config.audio_config is expected to be of type ClapAudioConfig but is of type"
                f" {type(config.audio_config)}."
            )

        text_config = config.text_config
        audio_config = config.audio_config

        self.logit_scale_a = nn.Parameter(torch.log(torch.tensor(config.logit_scale_init_value)))
        self.logit_scale_t = nn.Parameter(torch.log(torch.tensor(config.logit_scale_init_value)))

        self.projection_dim = config.projection_dim

        self.text_model = ClapTextModel(text_config)
        self.text_projection = ClapProjectionLayer(text_config)

        self.audio_model = ClapAudioModel(audio_config)
        self.audio_projection = ClapProjectionLayer(audio_config)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CLAP_TEXT_INPUTS_DOCSTRING)
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`ClapTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, ClapModel

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

        >>> inputs = tokenizer(["the sound of a cat", "the sound of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
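        >>> # hedged extra step (output not checked here): the features are L2-normalized,
        >>> # so text-to-text similarity reduces to a matrix product
        >>> sim = text_features @ text_features.T  # doctest: +SKIP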
        ```"""
        # Use CLAP model's config for some fields (if specified) instead of those of audio & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        pooled_output = text_outputs[1] if not return_dict else text_outputs.pooler_output

        text_features = self.text_projection(pooled_output)
        text_features = F.normalize(text_features, dim=-1)

        return text_features

    @add_start_docstrings_to_model_forward(CLAP_AUDIO_INPUTS_DOCSTRING)
    def get_audio_features(
        self,
        input_features: Optional[torch.Tensor] = None,
        is_longer: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            audio_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The audio embeddings obtained by
            applying the projection layer to the pooled output of [`ClapAudioModel`].

        Examples:

        ```python
        >>> from transformers import AutoFeatureExtractor, ClapModel
        >>> import torch

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused")
        >>> random_audio = torch.rand((16_000))
        >>> inputs = feature_extractor(random_audio, return_tensors="pt")
        >>> audio_features = model.get_audio_features(**inputs)
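        >>> # hedged extra step (output not checked here): audio features are L2-normalized too
        >>> audio_features.norm(dim=-1)  # doctest: +SKIP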
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        audio_outputs = self.audio_model(
            input_features=input_features,
            is_longer=is_longer,
            return_dict=return_dict,
        )
        pooled_output = audio_outputs[1] if not return_dict else audio_outputs.pooler_output

        audio_features = self.audio_projection(pooled_output)
        audio_features = F.normalize(audio_features, dim=-1)

        return audio_features

    @add_start_docstrings_to_model_forward(CLAP_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=ClapOutput, config_class=ClapConfig)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        input_features: Optional[torch.FloatTensor] = None,
        is_longer: Optional[torch.BoolTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ClapOutput]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import AutoProcessor, ClapModel

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-unfused")

        >>> input_text = ["Sound of a dog", "Sound of vacuum cleaner"]

        >>> inputs = processor(text=input_text, audios=audio_sample, return_tensors="pt", padding=True)

        >>> outputs = model(**inputs)
        >>> logits_per_audio = outputs.logits_per_audio  # this is the audio-text similarity score
        >>> probs = logits_per_audio.softmax(dim=-1)  # we can take the softmax to get the label probabilities
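        >>> # hedged extra step (output not checked here): the text-audio direction is the transpose
        >>> logits_per_text = outputs.logits_per_text  # doctest: +SKIP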
        ```"""
        # Use CLAP model's config for some fields (if specified) instead of those of audio & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        audio_outputs = self.audio_model(
            input_features=input_features,
            is_longer=is_longer,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        audio_embeds = audio_outputs[1] if not return_dict else audio_outputs.pooler_output
        audio_embeds = self.audio_projection(audio_embeds)

        text_embeds = text_outputs[1] if not return_dict else text_outputs.pooler_output
        text_embeds = self.text_projection(text_embeds)

        # normalized features
        audio_embeds = audio_embeds / audio_embeds.norm(p=2, dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

        # cosine similarity as logits
        logit_scale_text = self.logit_scale_t.exp()
        logit_scale_audio = self.logit_scale_a.exp()
        logits_per_text = torch.matmul(text_embeds, audio_embeds.t()) * logit_scale_text
        logits_per_audio = torch.matmul(audio_embeds, text_embeds.t()) * logit_scale_audio

        loss = None
        if return_loss:
            caption_loss = contrastive_loss(logits_per_text)
            audio_loss = contrastive_loss(logits_per_audio.t())
            loss = (caption_loss + audio_loss) / 2.0

        if not return_dict:
            output = (logits_per_audio, logits_per_text, text_embeds, audio_embeds, text_outputs, audio_outputs)
            return ((loss,) + output) if loss is not None else output

        return ClapOutput(
            loss=loss,
            logits_per_audio=logits_per_audio,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            audio_embeds=audio_embeds,
            text_model_output=text_outputs,
            audio_model_output=audio_outputs,
        )


@add_start_docstrings(
    """
    CLAP Text Model with a projection layer on top (a linear layer on top of the pooled output).
    """,
    CLAP_START_DOCSTRING,
)
class ClapTextModelWithProjection(ClapPreTrainedModel):
    config_class = ClapTextConfig

    def __init__(self, config: ClapTextConfig):
        super().__init__(config)
        self.text_model = ClapTextModel(config)
        self.text_projection = ClapProjectionLayer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.text_model.embeddings.word_embeddings = value

    @add_start_docstrings_to_model_forward(CLAP_TEXT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=ClapTextModelOutput, config_class=ClapTextConfig)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ClapTextModelOutput]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, ClapTextModelWithProjection

        >>> model = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused")
        >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

        >>> inputs = tokenizer(["a sound of a cat", "a sound of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> text_embeds = outputs.text_embeds
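        >>> # hedged extra step (output not checked here): projected, un-normalized pooled output
        >>> text_embeds.shape  # doctest: +SKIP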
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = text_outputs[1] if not return_dict else text_outputs.pooler_output

        text_embeds = self.text_projection(pooled_output)

        if not return_dict:
            outputs = (text_embeds, text_outputs[0]) + text_outputs[2:]
            return tuple(output for output in outputs if output is not None)

        return ClapTextModelOutput(
            text_embeds=text_embeds,
            last_hidden_state=text_outputs.last_hidden_state,
            hidden_states=text_outputs.hidden_states,
            attentions=text_outputs.attentions,
        )


@add_start_docstrings(
    """
    CLAP Audio Model with a projection layer on top (a linear layer on top of the pooled output).
    """,
    CLAP_START_DOCSTRING,
)
class ClapAudioModelWithProjection(ClapPreTrainedModel):
    config_class = ClapAudioConfig
    main_input_name = "input_features"

    def __init__(self, config: ClapAudioConfig):
        super().__init__(config)
        self.audio_model = ClapAudioModel(config)
        self.audio_projection = ClapProjectionLayer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.audio_model.audio_encoder.patch_embed.proj

    @add_start_docstrings_to_model_forward(CLAP_AUDIO_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=ClapAudioModelOutput, config_class=ClapAudioConfig)
    def forward(
        self,
        input_features: Optional[torch.FloatTensor] = None,
        is_longer: Optional[torch.BoolTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ClapAudioModelOutput]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import ClapAudioModelWithProjection, ClapProcessor

        >>> model = ClapAudioModelWithProjection.from_pretrained("laion/clap-htsat-fused")
        >>> processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> inputs = processor(audios=audio_sample, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> audio_embeds = outputs.audio_embeds
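        >>> # hedged extra step (output not checked here): projected pooled output of the audio branch
        >>> audio_embeds.shape  # doctest: +SKIP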
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        audio_outputs = self.audio_model(
            input_features=input_features,
            is_longer=is_longer,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = audio_outputs[1] if not return_dict else audio_outputs.pooler_output

        audio_embeds = self.audio_projection(pooled_output)

        if not return_dict:
            outputs = (audio_embeds, audio_outputs[0]) + audio_outputs[2:]
            return tuple(output for output in outputs if output is not None)

        return ClapAudioModelOutput(
            audio_embeds=audio_embeds,
            last_hidden_state=audio_outputs.last_hidden_state,
            attentions=audio_outputs.attentions,
            hidden_states=audio_outputs.hidden_states,
        )