
    g:v                        d Z ddlZddlmZ ddlmZ ddlZddlZddl	m
Z
 ddlm
c mZ ddlmZ ddlmZmZmZmZ dd	lmZ d
Ze G d de                      Ze G d de                      Ze G d de                      Z G d de
j                  Z G d de
j                  Z G d de
j                  Z G d de
j                  Z G d de
j                  Z  G d de
j                  Z! G d de
j                  Z" G d d e
j                  Z# G d! d"e          Z$d#Z%d$Z& ed%e%           G d& d'e$                      Z'dS )(zTransformers DAC model.    N)	dataclass)Optional   )PreTrainedModel)ModelOutputadd_start_docstrings%add_start_docstrings_to_model_forwardreplace_return_docstrings   )	DacConfigr   c                       e Zd ZU dZdZej        ed<   dZej        ed<   dZ	ej        ed<   dZ
ej        ed<   dZej        ed<   dS )	DacOutputa.  
    Args:
        loss (`torch.Tensor`):
            Loss from the encoder model, comprising the weighted combination of the commitment and codebook losses.
        audio_values (`torch.Tensor` of shape `(batch_size, input_length)`):
            Reconstructed audio data.
        quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`):
            Quantized continuous representation of input.
        audio_codes (`torch.LongTensor` of shape `(batch_size, num_codebooks, time_steps)`):
            Codebook indices for each codebook (quantized discrete representation of input).
        projected_latents (`torch.Tensor` of shape `(batch_size, num_codebooks * dimension, time_steps)`):
            Projected latents (continuous representation of input before quantization).
    Nlossaudio_valuesquantized_representationaudio_codesprojected_latents)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r   r   
LongTensorr        `/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/dac/modeling_dac.pyr   r   (   s           #D%
"""&*L%#***26e/666$(K!(((+/u(/////r   r   c                   t    e Zd ZU dZdZej        ed<   dZej        ed<   dZ	ej        ed<   dZ
ej        ed<   dS )DacEncoderOutputa  
    Args:
        loss (`torch.Tensor`):
            Loss from the encoder model, comprising the weighted combination of the commitment and codebook losses.
        quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`, *optional*):
            Quantized continuous representation of input.
        audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`, *optional*):
            Codebook indices for each codebook (quantized discrete representation of input).
        projected_latents (`torch.Tensor` of shape `(batch_size, num_codebooks * dimension, time_steps)`, *optional*):
            Projected latents (continuous representation of input before quantization).
    Nr   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r    ?   sl         
 
 #D%
"""26e/666%)K")))+/u(/////r   r    c                   ,    e Zd ZU dZdZej        ed<   dS )DacDecoderOutputz
    Args:
        audio_values (`torch.FloatTensor`  of shape `(batch_size, input_length)`, *optional*):
            Decoded audio values, obtained using the decoder part of Dac.
    Nr   )r   r   r   r   r   r   r   r   r   r   r   r"   r"   S   s1           '+L%#*****r   r"   c                   (     e Zd ZdZ fdZd Z xZS )Snake1dz;
    A 1-dimensional Snake activation function module.
    c                     t                                                       t          j        t	          j        d|d                    | _        d S )Nr   )super__init__nn	Parameterr   onesalpha)self
hidden_dim	__class__s     r   r'   zSnake1d.__init__d   s<    \%*Q
A">">??


r   c                 "   |j         }|                    |d         |d         d          }|| j        dz                                   t	          j        | j        |z                                d          z  z   }|                    |          }|S )Nr   r   g&.>   )shapereshaper+   
reciprocalr   sinpow)r,   hidden_statesr2   s      r   forwardzSnake1d.forwardh   s    #%--eAha"EE%d):(F(F(H(H59UYU_boUoKpKpKtKtuvKwKw(ww%--e44r   )r   r   r   r   r'   r8   __classcell__r.   s   @r   r$   r$   _   sV         @ @ @ @ @      r   r$   c                   4     e Zd ZdZdef fdZd Zd Z xZS )DacVectorQuantizea  
    Implementation of VQ similar to Karpathy's repo (https://github.com/karpathy/deep-vector-quantization)

    Additionally uses following tricks from improved VQGAN
    (https://arxiv.org/pdf/2110.04627.pdf):
        1. Factorized codes: Perform nearest neighbor lookup in low-dimensional space
            for improved codebook usage
        2. l2-normalized codes: Converts euclidean distance to cosine similarity which
            improves training stability
    configc                 (   t                                                       t          j        |j        |j        d          | _        t          j        |j        |j        d          | _        t          j        |j	        |j                  | _
        d S )Nr   kernel_size)r&   r'   r(   Conv1dhidden_sizecodebook_dimin_projout_proj	Embeddingcodebook_sizecodebookr,   r=   r.   s     r   r'   zDacVectorQuantize.__init__|   st    y!3V5HVWXXX	&"5v7IWXYYYV%96;NOOr   c                 l   |                      |          }|                     |          \  }}t          j        ||                                d          }t          j        ||                                d          }|||z
                                  z   }|                     |          }|||||fS )aJ  
        Quantizes the input tensor using a fixed codebook and returns the corresponding codebook vectors.

        Args:
            hidden_state (`torch.FloatTensor` of shape `(batch_size, dimension, time_steps)`):
                Input tensor.

        Returns:
            quantized_representation (`torch.Tensor`of shape `(batch_size, dimension, time_steps)`):
                Quantized continuous representation of input.
            commitment_loss (`torch.FloatTensor`of shape `(1)`):
                Commitment loss to train encoder to predict vectors closer to codebook entries.
            codebook_loss (`torch.FloatTensor`of shape `(1)`):
                Codebook loss to update the codebook.
            audio_codes (`torch.LongTensor` of shape `(batch_size, time_steps)`):
                Codebook indices for each codebook, quantized discrete representation of input.
            projected_latents (torch.FloatTensor of shape `(batch_size, num_codebooks * dimension, time_steps)`):
                Projected latents (continuous representation of input before quantization).
        mean)	reduction)rD   decode_latentsFmse_lossdetachrE   )r,   hidden_stater   r   r   commitment_losscodebook_losss          r   r8   zDacVectorQuantize.forward   s    * !LL66040C0CDU0V0V- +*%68P8W8W8Y8Yeklll
#;=N=U=U=W=Wcijjj#48PSd8d7l7l7n7n#n #'==1I#J#J '-Vgggr   c                    |j         \  }}}|                    ddd                              ||z  |          }| j        j        }t          j        |          }t          j        |          }|                    d                              dd          }|d|z  |	                                z  z
   |                    d                              dd          	                                z   }|
                    d          d         }	|	                    |                    d          d          }	|                     |	                              dd          }
|
|	fS )Nr   r1   r   T)keepdimr0   )r2   permuter3   rH   weightrN   	normalizer6   sumtmaxsize	transpose)r,   r7   
batch_sizer-   sequence_length	encodingsrH   l2_normdistindicesr   s              r   rM   z DacVectorQuantize.decode_latents   s<   2?2E/
J!))!Q22:::;WYcdd	=' K	**	;x(( --""&&q$&771y=8::<<7788<<??;N;NqZ^;N;_;_;a;a;c;cc((1++a.//-"4"4Q"7"7<<#'==#9#9#C#CAq#I#I '00r   )	r   r   r   r   r   r'   r8   rM   r9   r:   s   @r   r<   r<   p   sw        	 	Py P P P P P Ph h h@1 1 1 1 1 1 1r   r<   c                   4     e Zd ZdZddedef fdZd Z xZS )	DacResidualUnitza
    A residual unit composed of Snake1d and weight-normalized Conv1d layers with dilations.
       r   	dimensiondilationc                    t                                                       d|z  dz  }t          |          | _        t	          j        ||d||          | _        t          |          | _        t	          j        ||d          | _        d S )N   r1      )r@   rh   paddingr   r?   )	r&   r'   r$   snake1r(   rA   conv1snake2conv2)r,   rg   rh   padr.   s       r   r'   zDacResidualUnit.__init__   s    !a'i((Yy)X_bccc
i((Yy)CCC


r   c                    |}|                      |                     |                    }|                     |                     |                    }|j        d         |j        d         z
  dz  }|dk    r|d|| f         }||z   }|S )ar  
        Forward pass through the residual unit.

        Args:
            hidden_state (`torch.Tensor` of shape `(batch_size, channels, time_steps)`):
                Input tensor .

        Returns:
            output_tensor (`torch.Tensor` of shape `(batch_size, channels, time_steps)`):
                Input tensor after passing through the residual unit.
        r0   r1   r   .)rn   rm   rp   ro   r2   )r,   rQ   output_tensorrl   s       r   r8   zDacResidualUnit.forward   s     %

4;;}#=#=>>

4;;}#=#=>>%b)M,?,CCIQ;;'WgX-=(=>L$}4r   )rf   r   )r   r   r   r   intr'   r8   r9   r:   s   @r   re   re      sr         D D# Dc D D D D D D      r   re   c                   8     e Zd ZdZddededef fdZd Z xZS )	DacEncoderBlockz"Encoder block used in DAC encoder.r   r=   stridestride_indexc           
         t                                                       |j        d|z  z  }t          |dz  d          | _        t          |dz  d          | _        t          |dz  d          | _        t          |dz            | _        t          j
        |dz  |d|z  |t          j        |dz                      | _        d S )Nr1   r   rh   r   	   r@   rw   rl   )r&   r'   encoder_hidden_sizere   	res_unit1	res_unit2	res_unit3r$   rm   r(   rA   mathceilrn   )r,   r=   rw   rx   rg   r.   s        r   r'   zDacEncoderBlock.__init__   s    .L@	(a!DDD(a!DDD(a!DDDi1n--YNI1v:fVZV_`fij`jVkVk
 
 



r   c                     |                      |          }|                     |          }|                     |                     |                    }|                     |          }|S N)r~   r   rm   r   rn   r,   rQ   s     r   r8   zDacEncoderBlock.forward   sX    ~~l33~~l33{{4>>,#?#?@@zz,//r   r   r   	r   r   r   r   r   rt   r'   r8   r9   r:   s   @r   rv   rv      sl        ,,

 

y 

# 

 

 

 

 

 

 

      r   rv   c                   8     e Zd ZdZddededef fdZd Z xZS )	DacDecoderBlockz"Decoder block used in DAC decoder.r   r=   rw   rx   c           
         t                                                       |j        d|z  z  }|j        d|dz   z  z  }t          |          | _        t          j        ||d|z  |t          j        |dz                      | _	        t          |d          | _        t          |d          | _        t          |d          | _        d S )Nr1   r   r|   rz   r   r{   )r&   r'   decoder_hidden_sizer$   rm   r(   ConvTranspose1dr   r   conv_t1re   r~   r   r   )r,   r=   rw   rx   	input_dim
output_dimr.   s         r   r'   zDacDecoderBlock.__init__   s    .!\/A	/19I3JJ
i(()F
Ifqj))
 
 
 )a@@@(a@@@(a@@@r   c                     |                      |          }|                     |          }|                     |          }|                     |          }|                     |          }|S r   )rm   r   r~   r   r   r   s     r   r8   zDacDecoderBlock.forward  s]    {{<00||L11~~l33~~l33~~l33r   r   r   r:   s   @r   r   r      sv        ,,A Ay A# A A A A A A A$      r   r   c                   b     e Zd ZdZdef fdZddefdZdej	        fdZ
d	ej	        fd
Z xZS )DacResidualVectorQuantizez
    ResidualVectorQuantize block - Introduced in SoundStream: An end2end neural audio codec (https://arxiv.org/abs/2107.03312)
    r=   c                     t                                                       j        }j        }|| _        t	          j        fdt          j                  D                       | _        || _        d S )Nc                 .    g | ]}t                    S r   )r<   ).0ir=   s     r   
<listcomp>z6DacResidualVectorQuantize.__init__.<locals>.<listcomp>  s"    (f(f(fq):6)B)B(f(f(fr   )r&   r'   n_codebooksquantizer_dropoutr(   
ModuleListrange
quantizers)r,   r=   r   r   r.   s    `  r   r'   z"DacResidualVectorQuantize.__init__  ss    ("4&-(f(f(f(fERXRdLeLe(f(f(fgg!2r   Nn_quantizersc                 X   d}|}d}d}g }g }||n| j         }| j        rt          j        |j        d         f          | j         z  dz   }t          j        d| j         dz   |j        d         f          }	t          |j        d         | j        z            }
|	d|
         |d|
<   |                    |j	                  }t          | j                  D ]\  }}| j        du r||k    r n ||          \  }}}}}t          j        |j        d         f||j	                  |k     }|||ddddf         z  z   }||z
  }|||z  z  }|||z  z  }|                    |           |                    |           t          j        |d          }t          j        |d          }|||||fS )aQ  
        Quantizes the input tensor using a fixed set of codebooks and returns corresponding codebook vectors.
        Args:
            hidden_state (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`):
                Input tensor to be quantized.
            n_quantizers (`int`, *optional*):
                Number of quantizers to use. If specified and `self.quantizer_dropout` is True,
                this argument is ignored during training, and a random number of quantizers is used.

        Returns:
            quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`):
                Quantized continuous representation of input.
            audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`):
                Codebook indices for each codebook (quantized discrete representation of input).
            projected_latents (`torch.Tensor` of shape `(batch_size, num_codebooks * dimension, time_steps)`):
                Projected latents (continuous representation of input before quantization).
            commitment_loss (`torch.Tensor` of shape `(1)`):
                Commitment loss to train the encoder to predict vectors closer to codebook entries.
            codebook_loss (`torch.Tensor` of shape `(1)`):
                Codebook loss to update the codebook.
        r   Nr   F)
fill_valuedevicedim)r   trainingr   r*   r2   randintrt   r   tor   	enumerater   fullappendstackcat)r,   rQ   r   r   residualrR   rS   r   r   dropout	n_dropoutr   	quantizerquantized_representation_icommitment_loss_icodebook_loss_i	indices_iprojected_latents_imasks                      r   r8   z!DacResidualVectorQuantize.forward"  s   . $% '3'?||TEU= 	@ :|'9!'<&>??$BRRUVVLmAt'7!';l>PQR>S=UVVGL.q1D4JJKKI'.z	z':L)$'??<+>??L%do66 	: 	:LAy}%%!|*;*;mvmvn nj&(9?IWj
 :|1!461\M`aaadppD'?B\_cdededegkmqdq_rBr'r$"<<H 0477O_t33My)))$$%89999k+1555!I&7Q???'6GZgggr   r   c                 l   d}g }|j         d         }t          |          D ]{}| j        |                             |dd|ddf                                       dd          }|                    |           || j        |                             |          z  }||t          j        |d          |fS )a  
        Reconstructs the continuous representation from quantized codes.

        Args:
            audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`):
                Quantized discrete representation of input.

        Returns:
            quantized_representation (`torch.Tensor`):
                Quantized continuous representation of input.
            projected_latents (`torch.Tensor`):
                List of projected latents (continuous representations of input before quantization)
                for each codebook.
            audio_codes (`torch.Tensor`):
                Codebook indices for each codebook.
        g        r   Nr1   r   )	r2   r   r   rH   r]   r   rE   r   r   )r,   r   r   r   r   r   r   s          r   
from_codesz$DacResidualVectorQuantize.from_codesb  s    " $' !'*{## 	Y 	YA"&/!"4"="=k!!!QPQPQPQ'>R"S"S"]"]^_ab"c"c$$%8999$(:(C(CDW(X(XX$$'3D!)L)L)LkYYr   latentsc                    d}g }g }t          j        dgd | j        D             z             }t          j        |d          }t	          j        ||j        d         k              d                             dd          d         }t          |          D ]}||         ||dz            }
}	| j        |         	                    |dd|	|
ddf                   \  }}|
                    |           |
                    |           | j        |                             |          }||z   }|t          j        |d          fS )a  Reconstructs the quantized representation from unquantized latents.

        Args:
            latents (`torch.Tensor` of shape `(batch_size, total_latent_dimension, time_steps)`):
                Continuous representation of input after projection.

        Returns:
            quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`):
                Quantized representation of the full-projected space.
            quantized_latents (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`):
                Quantized representation of the latent space (continuous representation before quantization).
        r   c                     g | ]	}|j         
S r   )rC   )r   qs     r   r   z:DacResidualVectorQuantize.from_latents.<locals>.<listcomp>  s    2[2[2[a1>2[2[2[r   r   r   T)axiskeepdimsN)r   tensorr   cumsumnpwherer2   r[   r   rM   r   rE   r   )r,   r   r   quantized_latentscodescodebook_dims_tensordimsr   r   hidden_dim_jhidden_dim_kquantized_latents_icodes_ir   s                 r   from_latentsz&DacResidualVectorQuantize.from_latents|  sh    $% $|QC2[2[4?2[2[2[,[\\|0a888htw}Q'7788;??QQU?VVWXY{## 	] 	]A)-a$q1u+,L+/?1+=+L+LWUVUVUVXdeqXqstststUtMu+v+v($$%8999LL!!!)-);)D)DEX)Y)Y&'?B\'\$$'3D!)L)L)LLLr   r   )r   r   r   r   r   r'   rt   r8   r   Tensorr   r   r9   r:   s   @r   r   r     s         	3y 	3 	3 	3 	3 	3 	3>h >h# >h >h >h >h@Zel Z Z Z Z4MEL M M M M M M M Mr   r   c                   .     e Zd ZdZdef fdZd Z xZS )
DacDecoderzDAC Decoderr=   c                    t                                                       |j        }|j        }|j        }t          j        ||dd          | _        g }t          |          D ]\  }}|t          |||          gz  }t          j
        |          | _        |j        d|dz   z  z  }t          |          | _        t          j        |ddd          | _        t          j                    | _        d S )Nrk   r   r@   rl   r1   r   )r&   r'   rB   r   upsampling_ratiosr(   rA   rn   r   r   r   blockr$   rm   rp   Tanhtanh)
r,   r=   input_channelchannelsstridesr   rx   rw   r   r.   s
            r   r'   zDacDecoder.__init__  s    *-* Y}hAqQQQ
 $-g$6$6 	E 	E L&offlCCDDEE]5))
/19I3JJ
j))Yz1!QGGG
GII			r   c                     |                      |          }| j        D ]} ||          }|                     |          }|                     |          }|                     |          }|S r   )rn   r   rm   rp   r   )r,   rQ   layers      r   r8   zDacDecoder.forward  sn    zz,//Z 	/ 	/E 5..LL{{<00zz,//yy..r   r   r   r   r   r   r'   r8   r9   r:   s   @r   r   r     sY        y      *
 
 
 
 
 
 
r   r   c                   .     e Zd ZdZdef fdZd Z xZS )
DacEncoderzDAC Encoderr=   c                    t                                                       |j        }t          j        d|j        dd          | _        g | _        t          |          D ]+\  }}|dz   }| xj        t          |||          gz  c_        ,t          j
        | j                  | _        |j        d|z  z  }t          |          | _        t          j        ||j        dd          | _        d S )Nr   rk   r   r   )rw   rx   r1   )r&   r'   downsampling_ratiosr(   rA   r}   rn   r   r   rv   r   r$   rm   rB   rp   )r,   r=   r   rx   rw   d_modelr.   s         r   r'   zDacEncoder.__init__  s    ,Yq&"<!UVWWW

$-g$6$6 	^ 	^ L&'!+LJJ?6&|\\\]]JJJ]4:..
,q,>g&&Yw(:STUUU


r   c                     |                      |          }| j        D ]} ||          }|                     |          }|                     |          }|S r   )rn   r   rm   rp   )r,   rQ   modules      r   r8   zDacEncoder.forward  s]    zz,//j 	0 	0F!6,//LL{{<00zz,//r   r   r:   s   @r   r   r     s`        Vy V V V V V V$	 	 	 	 	 	 	r   r   c                   0    e Zd ZdZeZdZdZd Zd Z	d Z
dS )DacPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.
    dacinput_valuesc                     t          |t          j                  rMt          j                            |j        d           t          j                            |j        d           d S d S )Ng{Gz?)stdr   )
isinstancer(   rA   inittrunc_normal_rW   	constant_bias)r,   r   s     r   _init_weightsz DacPreTrainedModel._init_weights  s]    fbi(( 	.G!!&-T!:::Gfk1-----	. 	.r   c                    t           j        j        }t          t           j        j        d          rt           j        j        j        }| j        j        D ]"} ||j                    ||j                   # || j	        j
                    || j	        j                   | j	        j        D ]} ||j
                    ||j        j
                    ||j        j                    ||j        j
                    ||j        j                    ||j        j
                    ||j        j                    || j        j
                    || j        j                   | j        j        D ]} ||j                    ||j        j
                    ||j        j                    ||j        j
                    ||j        j                    ||j        j
                    ||j        j                   d S )Nweight_norm)r(   utilsr   hasattrparametrizationsr   r   rD   rE   encoderrn   rp   r   r~   r   r   decoderr   )r,   r   r   s      r   apply_weight_normz$DacPreTrainedModel.apply_weight_norm  s   h*28,m<< 	@(3?K^. 	( 	(EK&&&K''''DL&'''DL&'''\' 	/ 	/EK$$$K-...K-...K-...K-...K-...K-....DL&'''DL&'''\' 	/ 	/EK&&&K-...K-...K-...K-...K-...K-....	/ 	/r   c                    | j         j        D ]J}t          j                            |j                   t          j                            |j                   Kt          j                            | j        j                   t          j                            | j        j	                   | j        j
        D ]}t          j                            |j                   t          j                            |j        j                   t          j                            |j        j	                   t          j                            |j        j                   t          j                            |j        j	                   t          j                            |j        j                   t          j                            |j        j	                   t          j                            | j        j                   t          j                            | j        j	                   | j        j
        D ]}t          j                            |j                   t          j                            |j        j                   t          j                            |j        j	                   t          j                            |j        j                   t          j                            |j        j	                   t          j                            |j        j                   t          j                            |j        j	                   d S r   )r   r   r(   r   remove_weight_normrD   rE   r   rn   rp   r   r~   r   r   r   r   )r,   r   s     r   r   z%DacPreTrainedModel.remove_weight_norm  sa   ^. 	8 	8EH''666H''7777
##DL$6777
##DL$6777\' 	? 	?EH''444H''(=>>>H''(=>>>H''(=>>>H''(=>>>H''(=>>>H''(=>>>>
##DL$6777
##DL$6777\' 	? 	?EH''666H''(=>>>H''(=>>>H''(=>>>H''(=>>>H''(=>>>H''(=>>>>	? 	?r   N)r   r   r   r   r   config_classbase_model_prefixmain_input_namer   r   r   r   r   r   r   r     s^          L$O. . .
/ / /B? ? ? ? ?r   r   aH  
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`DacConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_values (`torch.Tensor` of shape `(batch_size, 1, time_steps)`).
            Audio data to encode,
        n_quantizers (`int`, *optional*):
            Number of quantizers to use. If `None`, all quantizers are used. Default is `None`.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
z%The DAC (Descript Audio Codec) model.c            
       l    e Zd Zdef fdZ eee          	 	 ddej	        de
dee         fd            Z eee          	 	 	 dd	eej	                 d
eej	                 dee         fd            Z ee           eee          	 	 ddej	        de
dee         fd                        Z xZS )DacModelr=   c                    t                                          |           || _        t          |          | _        t          |          | _        t          |          | _        t          t          j        | j        j                            | _        d| j        z  | j        j        k    rt          d          |                                  d S )Nr1   z'The codebook_size must be a power of 2.)r&   r'   r=   r   r   r   r   r   r   rt   r   log2rG   bits_per_codebook
ValueError	post_initrI   s     r   r'   zDacModel.__init__O  s       !&))!&))26::!$TYt{/H%I%I!J!Jd$$(AAAFGGG 	r   )output_typer   Nr   r   return_dictc                     ||n| j         j        }|                     |          }|                     ||          \  }}}}}| j         j        |z  | j         j        |z  z   }	|s|	|||fS t          |	|||          S )a  
        Encode given audio data and return quantized latent codes

        Args:
            input_values (`torch.Tensor of shape `(batch_size, 1, time_steps)`):
                Input audio data to encode,
            n_quantizers (int, *optional*):
                Number of quantizers to use. If None, all quantizers are used. Default is None.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Returns:

        )r=   r  r   r   commitment_loss_weightcodebook_loss_weightr    )
r,   r   r   r  r   r   r   rR   rS   r   s
             r   encodezDacModel.encode_  s    ( &1%<kk$+BY#'<<#=#= cgcqcq$ld
 d
` +/@/S` {1OCdkFfivFvv 	T2KARSS&>M^___r   r   r   c                    ||t          d          ||n| j        j        }| | j                            |          d         }|                     |                              d          }|s|fS t          |          S )a  Decode given latent codes and return audio data

        Args:
            quantized_representation (torch.Tensor of shape `(batch_size, dimension, time_steps)`, *optional*):
                Quantized continuous representation of input.
            audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`, *optional*):
                The codebook indices for each codebook, representing the quantized discrete
                representation of the input. This parameter should be provided if you want
                to decode directly from the audio codes (it will overwrite quantized_representation).
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

        Returns:

        NzDEither `quantized_representation` or `audio_codes` must be provided.r   r   )r   r=   r  r   r   r   squeezer"   )r,   r   r   r  r   s        r   decodezDacModel.decode  s    . $+0Ccddd%0%<kk$+BY"'+~'@'@'M'Ma'P$||$<==EEaHH 	# ?"---r   c                     ||n| j         j        }|j        d         }|                     ||d          \  }}}}|                     |d          d         dd|f         }	|s||	|||fS t          ||	|||          S )a  
        Returns:
        Examples:

        ```python
        >>> from datasets import load_dataset, Audio
        >>> from transformers import DacModel, AutoProcessor
        >>> librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

        >>> model = DacModel.from_pretrained("descript/dac_16khz")
        >>> processor = AutoProcessor.from_pretrained("descript/dac_16khz")
        >>> librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate))
        >>> audio_sample = librispeech_dummy[-1]["audio"]["array"]
        >>> inputs = processor(raw_audio=audio_sample, sampling_rate=processor.sampling_rate, return_tensors="pt")

        >>> encoder_outputs = model.encode(inputs["input_values"])
        >>> # Get the intermediate audio codes
        >>> audio_codes = encoder_outputs.audio_codes
        >>> # Reconstruct the audio from its quantized representation
        >>> audio_values = model.decode(encoder_outputs.quantized_representation)
        >>> # or the equivalent with a forward pass
        >>> audio_values = model(inputs["input_values"]).audio_values
        ```Nr0   F)r  r   .)r=   r  r2   r  r  r   )
r,   r   r   r  lengthr   r   r   r   r   s
             r   r8   zDacModel.forward  s    @ &1%<kk$+BY#B'IM,E JU J
 J
F&5F {{#;{OOPQRSVX_Y_X_S_` 	b,(@+O`aa|-E{Tefffr   )NN)NNN)r   r   r   r   r'   r
   r    _CONFIG_FOR_DOCr   r   rt   r   boolr  r"   r  r	   DAC_INPUTS_DOCSTRINGr   r8   r9   r:   s   @r   r   r   J  s       
y        +;/ZZZ !&*	` `l` ` d^	` ` ` [Z`B +;/ZZZ <@.2&*	#. #."*5<"8#. el+#. d^	#. #. #. [Z#.J +*+?@@9?SSS !&*	(g (gl(g (g d^	(g (g (g TS A@(g (g (g (g (gr   r   )(r   r   dataclassesr   typingr   numpyr   r   torch.nnr(   torch.nn.functional
functionalrN   modeling_utilsr   r   r   r   r	   r
   configuration_dacr   r  r   r    r"   Moduler$   r<   re   rv   r   r   r   r   r   DAC_START_DOCSTRINGr  r   r   r   r   <module>r     s      ! ! ! ! ! !                           - - - - - -            ) ( ( ( ( (  0 0 0 0 0 0 0 0, 0 0 0 0 0{ 0 0 0& + + + + +{ + + +    bi   "C1 C1 C1 C1 C1	 C1 C1 C1L" " " " "bi " " "J    bi   0    bi   >GM GM GM GM GM	 GM GM GMT" " " " " " " "J       BJ? J? J? J? J? J? J? J?Z    + Cg Cg Cg Cg Cg! Cg Cg	 Cg Cg Cgr   