
    g                    V   d Z ddlZddlmZ ddlmZmZmZmZ ddl	Z
ddlZddlZddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZmZmZmZ ddl m!Z!  ej"        e#          Z$dZ%e G d de                      Z&e G d de                      Z'ej(        j)        d             Z*	 	 	 	 	 dBdZ+d Z, G d dej        j-                  Z. G d dej-                  Z/ G d d ej-                  Z0 G d! d"ej-                  Z1 G d# d$ej-                  Z2 G d% d&ej-                  Z3 G d' d(ej-                  Z4 G d) d*ej-                  Z5 G d+ d,ej-                  Z6 G d- d.ej-                  Z7 G d/ d0ej-                  Z8 G d1 d2ej-                  Z9 G d3 d4ej-                  Z: G d5 d6ej-                  Z; G d7 d8ej-                  Z< G d9 d:ej-                  Z= G d; d<e          Z>d=Z?d>Z@ ed?e?           G d@ dAe>                      ZAdS )CzPyTorch VITS model.    N)	dataclass)AnyOptionalTupleUnion)nn   )ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)_prepare_4d_attention_mask)BaseModelOutputModelOutput)PreTrainedModel)add_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings   )
VitsConfigr   c                       e Zd ZU dZdZej        ed<   dZej        ed<   dZ	e
eej                          ed<   dZe
eej                          ed<   dZe
eej                          ed<   dS )VitsModelOutputaC  
    Describes the outputs for the VITS model, with potential hidden states and attentions.

    Args:
        waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            The final audio waveform predicted by the model.
        sequence_lengths  (`torch.FloatTensor` of shape `(batch_size,)`):
            The length in samples of each element in the `waveform` batch.
        spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
            The log-mel spectrogram predicted at the output of the flow model. This spectrogram is passed to the Hi-Fi
            GAN decoder model to obtain the final audio waveform.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nwaveformsequence_lengthsspectrogramhidden_states
attentions)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r   r   r   r   r        b/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/vits/modeling_vits.pyr   r   .   s          0 #'He&&&*.e'...6:K% 123:::8<M8E%"345<<<59Ju01299999r&   r   c                       e Zd ZU dZdZej        ed<   dZej        ed<   dZ	ej        ed<   dZ
eeej                          ed<   dZeeej                          ed<   dS )VitsTextEncoderOutputaa  
    Describes the outputs for the VITS text encoder model, with potential hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        prior_means (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            The predicted mean values of the prior distribution for the latent text variables.
        prior_log_variances (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            The predicted log-variance values of the prior distribution for the latent text variables.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nlast_hidden_stateprior_meansprior_log_variancesr   r   )r   r   r    r!   r*   r"   r#   r$   r+   r,   r   r   r   r   r%   r&   r'   r)   r)   O   s          . ,0u(///%)K")))-1*1118<M8E%"345<<<59Ju01299999r&   r)   c                     | |z   }t          j        |d d d |d d f                   }t          j        |d d |d d d f                   }||z  }|S N)r"   tanhsigmoid)input_ainput_bnum_channelsin_actt_acts_actactss          r'   fused_add_tanh_sigmoid_multiplyr8   o   sh    wFJvaaa,1233EM&LMM111!4566E5=DKr&   F      @MbP?c	                    | | k    | |k    z  }	|	 }
t          j        |           }t          j        |           }t          j        t          j        d|z
            dz
            }t
          j                            |d          }||d<   ||d<   | |
         ||
<   d||
<   t          | |	         ||	ddf         ||	ddf         ||	ddf         |||||	  	        \  ||	<   ||	<   ||fS )	a	  
    This transformation represents a monotonically increasing piecewise rational quadratic function. Outside of the
    `tail_bound`, the transform behaves as an identity function.

    Args:
        inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Second half of the hidden-states input to the Vits convolutional flow module.
        unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        reverse (`bool`, *optional*, defaults to `False`):
            Whether the model is being run in reverse mode.
        tail_bound (`float`, *optional* defaults to 5):
            Upper and lower limit bound for the rational quadratic function. Outside of this `tail_bound`, the
            transform behaves as an identity function.
        min_bin_width (`float`, *optional*, defaults to 1e-3):
            Minimum bin value across the width dimension for the piecewise rational quadratic function.
        min_bin_height (`float`, *optional*, defaults to 1e-3):
            Minimum bin value across the height dimension for the piecewise rational quadratic function.
        min_derivative (`float`, *optional*, defaults to 1e-3):
            Minimum bin value across the derivatives for the piecewise rational quadratic function.
    Returns:
        outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Hidden-states as transformed by the piecewise rational quadratic function with the `tail_bound` limits
            applied.
        log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Logarithm of the absolute value of the determinants corresponding to the `outputs` with the `tail_bound`
            limits applied.
    r   )r   r   )pad.r   .        N)	inputsunnormalized_widthsunnormalized_heightsunnormalized_derivativesreverse
tail_boundmin_bin_widthmin_bin_heightmin_derivative)	r"   
zeros_likenplogexpr   
functionalr<   _rational_quadratic_spline)rA   rB   rC   rD   rE   rF   rG   rH   rI   inside_interval_maskoutside_interval_maskoutputslog_abs_detconstants                 r'   (_unconstrained_rational_quadratic_splinerU   x   s<   \ #zk1f
6JK11v&&G"6**KvbfQ/001455H!}001Iv0VV'/V$(0W%%+,A%BG!"),K%&Ga*+/0Daaa0GH12F2IJ!9:NPQPQPQ:Q!R#%%
H 
H 
HDG !;/C#D Kr&   c	                 	   |}	| }
t          j        |           |
k     st          j        |           |	k    rt          d          |j        d         }||z  dk    rt          d| d|           ||z  dk    rt          d| d|           t
          j                            |d          }|d||z  z
  |z  z   }t          j        |d          }t
          j        	                    |d	d
d          }|	|
z
  |z  |
z   }|
|d<   |	|d<   |dddf         |dddf         z
  }|t
          j        
                    |          z   }t
          j                            |d          }|d||z  z
  |z  z   }t          j        |d          }t
          j        	                    |d	d
d          }|	|
z
  |z  |
z   }|
|d<   |	|d<   |dddf         |dddf         z
  }|r|n|}|dxx         dz  cc<   t          j        | d         |k    d          dz
  }|d         }|                    d|          d         }|                    d|          d         }|                    d|          d         }||z  }|                    d|          d         }|                    d|          d         }|dddf                             d|          d         }|                    d|          d         }||z   d|z  z
  }|s| |z
  |z  }|d|z
  z  }|||                    d          z  ||z  z   z  }|||z  z   }|||z  z   } |                    d          ||                    d          z  d|z  |z  z   |d|z
                      d          z  z   z  }!t          j        |!          dt          j        |          z  z
  }"| |"fS | |z
  }#|#|z  }$|||z
  z  |$z   }%||z  |$z
  }&| |#z  }'|&                    d          d|%z  |'z  z
  }(|(dk                                    st!          d|(           d|'z  |& t          j        |(          z
  z  })|)|z  |z   } |)d|)z
  z  }|||z  z   }|                    d          ||)                    d          z  d|z  |z  z   |d|)z
                      d          z  z   z  }!t          j        |!          dt          j        |          z  z
  }"| |" fS )a(	  
    This transformation represents a monotonically increasing piecewise rational quadratic function. Unlike the
    function `_unconstrained_rational_quadratic_spline`, the function behaves the same across the `tail_bound`.

    Args:
        inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Second half of the hidden-states input to the Vits convolutional flow module.
        unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        reverse (`bool`):
            Whether the model is being run in reverse mode.
        tail_bound (`float`):
            Upper and lower limit bound for the rational quadratic function. Outside of this `tail_bound`, the
            transform behaves as an identity function.
        min_bin_width (`float`):
            Minimum bin value across the width dimension for the piecewise rational quadratic function.
        min_bin_height (`float`):
            Minimum bin value across the height dimension for the piecewise rational quadratic function.
        min_derivative (`float`):
            Minimum bin value across the derivatives for the piecewise rational quadratic function.
    Returns:
        outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Hidden-states as transformed by the piecewise rational quadratic function.
        log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Logarithm of the absolute value of the determinants corresponding to the `outputs`.
    z-Input to a transform is not within its domainr?         ?zMinimal bin width z" too large for the number of bins zMinimal bin height dimr   )r   r   rT   r@   )r<   modevaluer=   r>   .Ngư>).N      r   zinvalid discriminant )r"   minmax
ValueErrorshaper   rN   softmaxcumsumr<   softplussumgatherpowrL   allRuntimeErrorsqrt)*rA   rB   rC   rD   rE   rF   rG   rH   rI   upper_boundlower_boundnum_binswidths	cumwidthsderivativesheights
cumheightsbin_locationsbin_idxinput_cumwidthsinput_bin_widthsinput_cumheightsdeltainput_deltainput_derivativesinput_derivatives_plus_oneinput_heightsintermediate1thetatheta_one_minus_theta	numeratordenominatorrR   derivative_numeratorrS   intermediate2intermediate3abcdiscriminantroots*                                             r'   rO   rO      s   X K+Ky;&&%)F*;*;k*I*IHIII"(,Hx#%%imii_giijjj 3&&k~kkaikklll]""#6B"??Fa-("::fDDFV,,,I!!)jPS!TTI{*i7+EI#If$IgsABBw)C"H"55F 2=#9#9:R#S#SSKm##$8b#AAGNX$= =HHGg2...J"":6
RU"VVJ+z9KGJ$Jv%Jwabb!JsCRCx$88G")8JJyM'd"iy)]:CCCaGGi G&&r733F;O}}R11&9!((W55f=fE,,r7++F3K#**2w77?!,S!""W!5!<!<R!I!I&!QNN2w//7M%(BBQ_TM &%/)-== %U 3![599Q<<%?BSVkBk%kl	!M4I$II"Y%<<*q11&15+o 5561u9//!"4"445 

 i 455EIk<R<R8RR## !11%5[+<<=M--=L=(uuQxx!a%!)+!&&(( 	GE|EEFFFA1"uz,7778))O; $D 1!M4I$II*q11&!4+o 5561t8.."3"334 

 i 455EIk<R<R8RR$$r&   c                   6     e Zd Zdedef fdZddZd Z xZS )VitsWaveNetconfig
num_layersc                 B   t                                                       |j        | _        || _        t          j                                        | _        t          j                                        | _        t          j	        |j
                  | _        t          t
          j        j        d          rt
          j        j        j        }nt
          j        j        }|j        dk    rCt          j                            |j        d|j        z  |z  d          } ||d          | _        t'          |          D ]}|j        |z  }|j        |z  |z
  dz  }t          j                            |j        d|j        z  |j        ||          } ||d          }| j                            |           ||dz
  k     rd|j        z  }	n|j        }	t          j                            |j        |	d          }
 ||
d          }
| j                            |
           d S )Nweight_normr   r\   r   weight)name)in_channelsout_channelskernel_sizedilationpadding)super__init__hidden_sizer   r"   r   
ModuleList	in_layersres_skip_layersDropoutwavenet_dropoutdropouthasattrutilsparametrizationsr   speaker_embedding_sizeConv1d
cond_layerrangewavenet_dilation_ratewavenet_kernel_sizeappend)selfr   r   r   r   ir   r   in_layerres_skip_channelsres_skip_layer	__class__s              r'   r   zVitsWaveNet.__init__K  s   !-$,,..$x2244z&"89928,m<< 	/(3?KK(.K(A--)FFL^H^akHkmnooJ)k*8DDDDOz"" 	8 	8A3Q6H1H<xGAMGx".!33"6! '  H #{8(;;;HN!!(+++ :>!!$%(:$:!!$*$6!"X__V-?ARTUVVN([hGGGN ''7777+	8 	8r&   Nc                    t          j        |          }t          j        | j        g          }||                     |          }t          | j                  D ]} | j        |         |          }|*|dz  | j        z  }|d d ||d| j        z  z   d d f         }	nt          j        |          }	t          ||	|d                   }
| 	                    |
          }
 | j
        |         |
          }|| j        dz
  k     r8|d d d | j        d d f         }||z   |z  }||d d | j        d d d f         z   }||z   }||z  S )Nr\   r   r   )r"   rJ   	IntTensorr   r   r   r   r   r8   r   r   )r   rA   padding_maskglobal_conditioningrR   num_channels_tensorr   r   cond_offsetglobal_statesr7   res_skip_actsres_actss                r'   forwardzVitsWaveNet.forwardt  s   "6**#ot/?.@AA*"&//2E"F"Ft'' 	2 	2A-DN1-f55M".!ed&66 3AAA{[STW[WgSgEg7gijijij4j k % 0 ? ?2=-QdefQghhD<<%%D3D03D99M4?Q&&&(,>d.>,>)AB 8+|;!M!!!T5E5G5G2J$KK!M1%%r&   c                 &   | j         dk    r)t          j        j                            | j                   | j        D ]&}t          j        j                            |           '| j        D ]&}t          j        j                            |           'd S )Nr   )r   r"   r   r   remove_weight_normr   r   r   r   layers     r'   r   zVitsWaveNet.remove_weight_norm  s    &!++HN--do>>>^ 	5 	5EHN--e4444) 	5 	5EHN--e4444	5 	5r&   r.   )	r   r   r    r   intr   r   r   __classcell__r   s   @r'   r   r   J  so        '8z '8s '8 '8 '8 '8 '8 '8R& & & &:5 5 5 5 5 5 5r&   r   c                   ,     e Zd Zdef fdZddZ xZS )VitsPosteriorEncoderr   c                 0   t                                                       |j        | _        t	          j        |j        |j        d          | _        t          ||j
                  | _        t	          j        |j        | j        dz  d          | _        d S )Nr   r   r\   )r   r   	flow_sizer   r   r   spectrogram_binsr   conv_prer   $posterior_encoder_num_wavenet_layerswavenet	conv_projr   r   r   s     r'   r   zVitsPosteriorEncoder.__init__  sz    ",	&"96;MqQQ"6f6abbb6#5t7H17LaPPr&   Nc                 6   |                      |          |z  }|                     |||          }|                     |          |z  }t          j        || j        d          \  }}|t          j        |          t          j        |          z  z   |z  }|||fS )Nr   rX   )r   r   r   r"   splitr   
randn_likerM   )r   rA   r   r   statsmean
log_stddevsampleds           r'   r   zVitsPosteriorEncoder.forward  s    v&&5fl4GHHv&&5 ;ud.?QGGGj%*40059Z3H3HHHLXj((r&   r.   r   r   r    r   r   r   r   r   s   @r'   r   r     s_        Qz Q Q Q Q Q Q) ) ) ) ) ) ) )r&   r   c                   :     e Zd Zd
 fd	ZddZd Zd Zd	 Z xZS )HifiGanResidualBlockr	   r   r	      皙?c                 d    t                                                       | _        t          j         fdt          t                              D                        _        t          j         fdt          t                              D                        _        d S )Nc                     g | ]<}t          j        d |                             |                             =S r   )strider   r   r   r   get_padding).0r   channelsr   r   r   s     r'   
<listcomp>z1HifiGanResidualBlock.__init__.<locals>.<listcomp>  sf     
 
 
  	%a[ ,,[(1+FF  
 
 
r&   c                 l    g | ]0}t          j        d d                     d                     1S r   r   )r   _r   r   r   s     r'   r   z1HifiGanResidualBlock.__init__.<locals>.<listcomp>  s^     
 
 
  	 ,,[!<<  
 
 
r&   )	r   r   leaky_relu_sloper   r   r   lenconvs1convs2)r   r   r   r   r   r   s   ```` r'   r   zHifiGanResidualBlock.__init__  s     0m
 
 
 
 
 
 
 s8}}--
 
 

 
 m
 
 
 
 
 
 s8}}--
 
 

 
r&   r   c                     ||z  |z
  dz  S )Nr\   r%   )r   r   r   s      r'   r   z HifiGanResidualBlock.get_padding  s    h&1a77r&   c                     t           j        j        }t          t           j        j        d          rt           j        j        j        }| j        D ]} ||           | j        D ]} ||           d S Nr   )r   r   r   r   r   r   r   r   r   r   s      r'   apply_weight_normz&HifiGanResidualBlock.apply_weight_norm  s    h*28,m<< 	@(3?K[ 	 	EK[ 	 	EK	 	r&   c                     | j         D ]!}t          j                            |           "| j        D ]!}t          j                            |           "d S r.   )r   r   r   r   r   r   s     r'   r   z'HifiGanResidualBlock.remove_weight_norm  s`    [ 	/ 	/EH''....[ 	/ 	/EH''....	/ 	/r&   c                    t          | j        | j                  D ]l\  }}|}t          j                            || j                  } ||          }t          j                            || j                  } ||          }||z   }m|S r.   )zipr   r   r   rN   
leaky_relur   )r   r   conv1conv2residuals        r'   r   zHifiGanResidualBlock.forward  s    T[99 	5 	5LE5$HM44]DDYZZM!E-00MM44]DDYZZM!E-00M)H4MMr&   )r	   r   r   r   )	r   r   r    r   r   r   r   r   r   r   s   @r'   r   r     s~        
 
 
 
 
 
>8 8 8 8  / / /      r&   r   c                   r     e Zd Zdef fdZd Zd Z	 d
dej        de	ej                 dej        fd	Z
 xZS )VitsHifiGanr   c                    t                                                       || _        t          |j                  | _        t          |j                  | _        t          j	        |j
        |j        ddd          | _        t          j                    | _        t          t!          |j        |j                            D ]X\  }\  }}| j                            t          j        |j        d|z  z  |j        d|dz   z  z  ||||z
  dz                       Yt          j                    | _        t+          t          | j                            D ]a}|j        d|dz   z  z  }t!          |j        |j                  D ]4\  }}| j                            t/          ||||j                             5bt          j	        |ddddd          | _        |j        dk    r't          j	        |j        |j        d          | _        d S d S )	N   r   r	   )r   r   r   r\   F)r   r   r   biasr   )r   r   r   r   resblock_kernel_sizesnum_kernelsupsample_ratesnum_upsamplesr   r   r   upsample_initial_channelr   r   	upsampler	enumerater   upsample_kernel_sizesr   ConvTranspose1d	resblocksr   resblock_dilation_sizesr   r   	conv_postr   cond)r   r   r   upsample_rater   r   r   r   s          r'   r   zVitsHifiGan.__init__  s   v;<< !677	+
 
 
 /8V=RTZTp9q9q/r/r 		 		+A+{N!!"31=3a!eE +((=8Q>      s4>**++ 	v 	vA61Q<HH),V-I6Ki)j)j v v%X%%&:8[RZ\b\s&t&tuuuuv 8QAaQRY^___(A--	&"?A`bcddDIII .-r&   c                     t           j        j        }t          t           j        j        d          rt           j        j        j        }| j        D ]} ||           | j        D ]}|                                 d S r   )r   r   r   r   r   r   r  r   r   s      r'   r   zVitsHifiGan.apply_weight_norm  s    h*28,m<< 	@(3?K^ 	 	EK^ 	& 	&E##%%%%	& 	&r&   c                     | j         D ]!}t          j                            |           "| j        D ]}|                                 d S r.   )r   r   r   r   r  r   s     r'   r   zVitsHifiGan.remove_weight_norm  s\    ^ 	/ 	/EH''....^ 	' 	'E$$&&&&	' 	'r&   Nr   r   returnc                 j   |                      |          }|||                     |          z   }t          | j                  D ]}t          j                            || j        j                  } | j	        |         |          } | j
        || j        z           |          }t          d| j                  D ]&}| | j
        || j        z  |z            |          z  }'|| j        z  }t          j                            |          }|                     |          }t          j        |          }|S )aG  
        Converts a spectrogram into a speech waveform.

        Args:
            spectrogram (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`):
                Tensor containing the spectrograms.
            global_conditioning (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_size, 1)`, *optional*):
                Tensor containing speaker embeddings, for multispeaker models.

        Returns:
            `torch.FloatTensor`: Tensor of shape shape `(batch_size, 1, num_frames)` containing the speech waveform.
        Nr   )r   r  r   r   r   rN   r   r   r   r   r  r   r  r"   r/   )r   r   r   r   r   	res_statejr   s           r'   r   zVitsHifiGan.forward   s.    k22*)DII6I,J,JJMt)** 	9 	9AM44]DKD`aaM-DN1-m<<M<q4+;';<]KKI1d.// U UET^A0@,@1,DEmTTT		%(88MM00??}55:m,,r&   r.   )r   r   r    r   r   r   r   r"   r#   r   r   r   r   s   @r'   r   r     s        "ez "e "e "e "e "e "eH& & &' ' ' bf    , CKEL]C^ 		               r&   r   c                   ,     e Zd Zdef fdZddZ xZS )VitsResidualCouplingLayerr   c                 0   t                                                       |j        dz  | _        t	          j        | j        |j        d          | _        t          ||j	                  | _
        t	          j        |j        | j        d          | _        d S )Nr\   r   r   )r   r   r   half_channelsr   r   r   r   r    prior_encoder_num_wavenet_layersr   r  r   s     r'   r   z"VitsResidualCouplingLayer.__init__D  sz    #-2	$"4f6H!LL"6f6]^^^6#5t7I1MMr&   NFc                    t          j        || j        gdz  d          \  }}|                     |          |z  }|                     |||          }|                     |          |z  }t          j        |          }	|sP||t          j        |	          z  |z  z   }t          j        ||gd          }
t          j	        |	ddg          }|
|fS ||z
  t          j        |	           z  |z  }t          j        ||gd          }
|
d fS )Nr\   r   rX   )
r"   r   r  r   r   r  rJ   rM   catre   )r   rA   r   r   rE   
first_halfsecond_halfr   r   r   rR   log_determinants               r'   r   z!VitsResidualCouplingLayer.forwardL  s   "'+ft7I6JQ6NTU"V"V"V
Kj11L@]LBUVV~~m,,|;%d++
 	!uy/D/D!D|!SSKi[ 9qAAAG#i
QF;;OO++&-J;1G1GG,VKi[ 9qAAAGD= r&   NFr   r   s   @r'   r  r  C  s_        Nz N N N N N N! ! ! ! ! ! ! !r&   r  c                   ,     e Zd Zdef fdZddZ xZS )VitsResidualCouplingBlockr   c                     t                                                       t          j                    | _        t          |j                  D ])}| j                            t          |                     *d S r.   )	r   r   r   r   flowsr   prior_encoder_num_flowsr   r  )r   r   r   r   s      r'   r   z"VitsResidualCouplingBlock.__init___  sp    ]__
v566 	A 	AAJ7??@@@@	A 	Ar&   NFc                     |s1| j         D ](} ||||          \  }}t          j        |dg          })n?t          | j                   D ]*}t          j        |dg          } ||||d          \  }}+|S )Nr   TrE   )r  r"   flipreversed)r   rA   r   r   rE   flowr   s          r'   r   z!VitsResidualCouplingBlock.forwarde  s     	Z
 1 1 D7JKK	FQC001 !,, Z ZFQC00 D7JTXYYY	r&   r  r   r   s   @r'   r  r  ^  s_        Az A A A A A A	 	 	 	 	 	 	 	r&   r  c                   .     e Zd Zddef fdZddZ xZS )VitsDilatedDepthSeparableConvr@   r   c                 ,   t                                                       |j        }|j        }|j        | _        t          j        |          | _        t          j	                    | _
        t          j	                    | _        t          j	                    | _        t          j	                    | _        t          | j                  D ]}||z  }||z  |z
  dz  }| j
                            t          j        ||||||                     | j                            t          j        ||d                     | j                            t          j        |                     | j                            t          j        |                     d S )Nr\   )r   r   r   groupsr   r   r   )r   r   duration_predictor_kernel_sizer   depth_separable_num_layersr   r   r   r   r   convs_dilatedconvs_pointwisenorms_1norms_2r   r   r   	LayerNorm)	r   r   dropout_rater   r   r   r   r   r   s	           r'   r   z&VitsDilatedDepthSeparableConv.__init__r  se   ;% ;z,//]__!}}}t'' 	8 	8A"A~H"X-8Q>G%%	 (!) +#%#  	 	 	  ''	(Ha(H(HIIILX 6 6777LX 6 67777	8 	8r&   Nc                 R   |||z   }t          | j                  D ]} | j        |         ||z            } | j        |         |                    dd                                        dd          }t
          j                            |          } | j        |         |          } | j	        |         |                    dd                                        dd          }t
          j                            |          }| 
                    |          }||z   }||z  S Nr   r?   )r   r   r)  r+  	transposer   rN   gelur*  r,  r   )r   rA   r   r   r   r   s         r'   r   z%VitsDilatedDepthSeparableConv.forward  s   *11Ft'' 	, 	,A1D.q1&<2GHHM+DLOM,C,CAr,J,JKKUUVWY[\\MM..}==M3D03MBBM+DLOM,C,CAr,J,JKKUUVWY[\\MM..}==M LL77Mm+FF$$r&   )r@   r.   r   r   s   @r'   r$  r$  q  s]        8 8z 8 8 8 8 8 88% % % % % % % %r&   r$  c                   ,     e Zd Zdef fdZddZ xZS )VitsConvFlowr   c                    t                                                       |j        | _        |j        dz  | _        |j        | _        |j        | _	        t          j        | j        | j        d          | _        t          |          | _        t          j        | j        | j        | j        dz  dz
  z  d          | _        d S )Nr\   r   r	   )r   r   r   filter_channelsdepth_separable_channelsr  duration_predictor_flow_binsrm   duration_predictor_tail_boundrF   r   r   r   r$  conv_ddsr   r   s     r'   r   zVitsConvFlow.__init__  s    %1#<A; >	$"4d6JANN5f==4#79Kt}_`O`cdOd9eghiir&   NFc                    t          j        || j        gdz  d          \  }}|                     |          }|                     |||          }|                     |          |z  }|j        \  }}	}
|                    ||	d|
                              dddd          }|dd | j	        f         t          j        | j                  z  }|d| j	        d| j	        z  f         t          j        | j                  z  }|dd| j	        z  d f         }t          |||||| j                  \  }}t          j        ||gd          |z  }|st          j        ||z  ddg          }||fS |d fS )	Nr\   r   rX   r?   r   r	   .)rE   rF   )r"   r   r  r   r:  r   ra   reshapepermuterm   mathrj   r6  rU   rF   r  re   )r   rA   r   r   rE   r  r  r   
batch_sizer   lengthrB   rC   rD   rS   rR   r  s                    r'   r   zVitsConvFlow.forward  s   "'+ft7I6JQ6NTU"V"V"V
Kj11m\CVWW}55D'1'7$
Hf%--j(BOOWWXY[\^_abcc+C4=,@ADIdNbDcDcc,S$-!dmBS2S-STW[W`aeauWvWvv#0a$-6G6I6I1I#J #K $$
 $
 $
 [ )Z51===L 	!#il(BQFKKOO++D= r&   r  r   r   s   @r'   r4  r4    s_        	jz 	j 	j 	j 	j 	j 	j! ! ! ! ! ! ! !r&   r4  c                   ,     e Zd Zdef fdZddZ xZS )VitsElementwiseAffiner   c                 $   t                                                       |j        | _        t	          j        t          j        | j        d                    | _        t	          j        t          j        | j        d                    | _	        d S Nr   )
r   r   r7  r   r   	Parameterr"   zeros	translate	log_scaler   s     r'   r   zVitsElementwiseAffine.__init__  se    7ek$-&C&CDDek$-&C&CDDr&   NFc                     |sL| j         t          j        | j                  |z  z   }||z  }t          j        | j        |z  ddg          }||fS || j         z
  t          j        | j                   z  |z  }|d fS Nr   r\   )rG  r"   rM   rH  re   )r   rA   r   r   rE   rR   r  s          r'   r   zVitsElementwiseAffine.forward  s     	!nuy'@'@6'IIG,G#i(E1vNNOO++.%)T^O2L2LL|[GD= r&   r  r   r   s   @r'   rB  rB    s_        Ez E E E E E E! ! ! ! ! ! ! !r&   rB  c                   &     e Zd Z fdZddZ xZS )VitsStochasticDurationPredictorc                    t                                                       |j        }|j        }t	          j        ||d          | _        t	          j        ||d          | _        t          ||j	                  | _
        |dk    rt	          j        ||d          | _        t	          j                    | _        | j                            t          |                     t!          |j                  D ])}| j                            t%          |                     *t	          j        d|d          | _        t	          j        ||d          | _        t          ||j	                  | _        t	          j                    | _        | j                            t          |                     t!          |j                  D ])}| j                            t%          |                     *d S )Nr   )r.  r   )r   r   r   r   r   r   r   r   r$  duration_predictor_dropoutr:  r  r   r  r   rB  r   duration_predictor_num_flowsr4  post_conv_prepost_conv_projpost_conv_dds
post_flows)r   r   	embed_dimr6  r   r   s        r'   r   z(VitsStochasticDurationPredictor.__init__  s   1	 ,	/?AFF?OQGG5:
 
 

 >>	)_a@@DI]__

/77888v:;; 	4 	4AJl6223333Yq/1== i!LL::
 
 

 -//4V<<===v:;; 	9 	9AO""<#7#78888	9 	9r&   NFrW   c                    t          j        |          }|                     |          }|,t          j        |          }||                     |          z   }|                     ||          }|                     |          |z  }|s|                     |          }|                     ||          }|                     |          |z  }t          j	        |
                    d          d|
                    d                                        |j        |j                  |z  }d}	|}
| j        D ]1} ||
|||z             \  }
}t          j        |
dg          }
|	|z  }	2t          j        |
ddgd          \  }}|	t          j        t$          j                            |          t$          j                            |           z   |z  ddg          z  }	t          j        dt+          j        dt*          j        z            |dz  z   z  |z  ddg          |	z
  }|t          j        |          z
  |z  }t          j        t          j        |d                    |z  }t          j        | ddg          }t          j        ||gd          }| j        D ].} ||||          \  }}t          j        |dg          }||z  }/t          j        d	t+          j        dt*          j        z            |dz  z   z  |z  ddg          |z
  }||z   S t9          t;          | j                            }|d d
         |d         gz   }t          j	        |
                    d          d|
                    d                                        |j        |j                  |z  }|D ]*}t          j        |dg          } ||||d          \  }}+t          j        |ddgd          \  }}|S )Nr   r\   )devicedtype)r   r   rX         gh㈵>g      ?r?   T)r   rE   )r"   detachr   r  r:  r   rP  rR  rQ  randnsizetorV  rW  rS  r   r   re   r   rN   
logsigmoidr>  rL   pir0   	clamp_minr  r  listr!  )r   rA   r   r   	durationsrE   noise_scaler   random_posteriorlog_determinant_posterior_sumlatents_posteriorr"  r  r  r  logqlog_determinant_sumlatentsnllr  r   log_durations                         r'   r   z'VitsStochasticDurationPredictor.forward  s+   f%%v&&*"',/B"C"Cdii(;<<<Fv|44'',6 5	  ..y99M ..}lKKM //>>MM INN1--q)..2C2CDDGGv}djdpGqq  -.) 0 A A59T%|R_I_6 6 62!? %*J/@1#$F$F!-@--&+k2CaVQR&S&S&S#J)UY))*558P8PR\Q\8]8]]ammpqstou. . ) 	$$(1tw;"7"7;KQ;N"OPS__bcefaghh/0 
 $emJ&?&??<OJ5?:t#D#DEETJ"')ZK!Q"@"@i[ 9qAAAG
 7 7+/4[a+b+b+b(*Wqc22#6##)C48AK#8#8GQJ#GH<WZ[]^Y_``cvvC:$*--..E#2#J%),E FKKNNAv{{1~~>>AA^d^jAkk   c c*Wqc22!T'<V]abbb
#k'Aq6qAAAOL!r&   )NNFrW   r   r   r    r   r   r   r   s   @r'   rL  rL    sU        9 9 9 9 9@@  @  @  @  @  @  @  @ r&   rL  c                   &     e Zd Z fdZddZ xZS )VitsDurationPredictorc                 D   t                                                       |j        }|j        }t	          j        |j                  | _        t	          j        |j	        |||dz            | _
        t	          j        ||j                  | _        t	          j        ||||dz            | _        t	          j        ||j                  | _        t	          j        |dd          | _        |j        dk    r't	          j        |j        |j	        d          | _        d S d S )Nr\   )r   epsr   r   )r   r   r'  "duration_predictor_filter_channelsr   r   rN  r   r   r   conv_1r-  layer_norm_epsnorm_1conv_2norm_2projr   r  )r   r   r   r6  r   s       r'   r   zVitsDurationPredictor.__init__A  s    ; Cz&"CDDi 2O[ZeijZjkkkl?8MNNNi+WbfgWghhhl?8MNNNIoq!44	(A--	&"?ASUVWWDIII .-r&   Nc                    t          j        |          }|,t          j        |          }||                     |          z   }|                     ||z            }t          j        |          }|                     |                    dd                                        dd          }|                     |          }|                     ||z            }t          j        |          }| 	                    |                    dd                                        dd          }|                     |          }| 
                    ||z            }||z  S r0  )r"   rZ  r  rs  reluru  r1  r   rv  rw  rx  )r   rA   r   r   s       r'   r   zVitsDurationPredictor.forwardP  s.   f%%*"',/B"C"Cdii(;<<<FVl233F##V--a4455??2FFf%%Vl233F##V--a4455??2FFf%%6L011$$r&   r.   rl  r   s   @r'   rn  rn  @  sQ        X X X X X% % % % % % % %r&   rn  c                        e Zd ZdZdef fdZdej        dedefdZ		 	 	 	 dd
ej        de
ej                 de
ej                 de
ej                 dedeej        e
ej                 f         fdZd Zd Zd Z xZS )VitsAttentionz?Multi-headed attention with relative positional representation.r   c                    t                                                       |j        | _        |j        | _        |j        | _        |j        | _        | j        | j        z  | _	        | j	        dz  | _
        | j	        | j        z  | j        k    r t          d| j         d| j         d          t          j        | j        | j        |j                  | _        t          j        | j        | j        |j                  | _        t          j        | j        | j        |j                  | _        t          j        | j        | j        |j                  | _        | j        rt          j        t)          j        d| j        dz  dz   | j	                  | j
        z            | _        t          j        t)          j        d| j        dz  dz   | j	                  | j
        z            | _        d S d S )NrX  zIhidden_size must be divisible by num_attention_heads (got `hidden_size`: z and `num_attention_heads`: z).)r   r   r\   )r   r   r   rT  num_attention_heads	num_headsattention_dropoutr   window_sizehead_dimscalingr`   r   Linearuse_biask_projv_projq_projout_projrE  r"   r[  	emb_rel_k	emb_rel_vr   s     r'   r   zVitsAttention.__init__h  s   +3/!-$.8}d*MDN*t~==B\`\j B B/3~B B B  
 iV_UUUiV_UUUiV_UUU	$.$.vWWW 	r\%+a9IA9MPQ9QSWS`*a*adhdp*pqqDN\%+a9IA9MPQ9QSWS`*a*adhdp*pqqDNNN	r 	rr&   tensorseq_lenbszc                     |                     ||| j        | j                                      dd                                          S rJ  )viewr  r  r1  
contiguous)r   r  r  r  s       r'   _shapezVitsAttention._shape  s<    {{3GGQQRSUVWWbbdddr&   NFr   key_value_statesattention_masklayer_head_maskoutput_attentionsr
  c                 ^	   |                                 \  }}}|                     |          | j        z  }	|                     |                     |          d|          }
|                     |                     |          d|          }|| j        z  d| j        f} |                     |	||          j        | }	 |
j        | }
 |j        | }|
                     d          }t          j
        |	|
                    dd                    }|                                 || j        z  ||fk    r2t          d|| j        z  ||f d|                                            | j        ^|                     | j        |          }t          j        |	|                    dd                    }|                     |          }||z  }||                                 |d||fk    r+t          d|d||f d|                                            |                    || j        ||          |z   }|                    || j        z  ||          }t$          j                            |d	          }||                                 | j        fk    r-t          d
| j        f d|                                            |                    dddd          |                    || j        ||          z  }|                    || j        z  ||          }|r=|                    || j        ||          }|                    || j        z  ||          }nd}t$          j                            || j        | j                  }t          j
        ||          }|                                 || j        z  || j        fk    r5t          d|| j        || j        f d|                                            | j        J|                     | j        |          }|                     |          }t          j        ||          }||z  }|                    || j        || j                  }|                    dd          }|                    ||| j                  }|                     |          }||fS )z#Input shape: Batch x Time x Channelr?   r   r\   z$Attention weights should be of size z	, but is NrY  z!Attention mask should be of size rX   z/Head mask for a single layer should be of size )ptrainingz `attn_output` should be of size )r\  r  r  r  r  r  r  r  r  r"   bmmr1  r`   r  _get_relative_embeddingsr  matmul'_relative_position_to_absolute_positionr   rN   rb   r   r  r  '_absolute_position_to_relative_positionr<  rT  r  )r   r   r  r  r  r  r  tgt_lenr   query_states
key_statesvalue_states
proj_shapesrc_lenattn_weightskey_relative_embeddingsrelative_logitsrel_pos_biasattn_weights_reshaped
attn_probsattn_outputvalue_relative_embeddingsrelative_weightss                          r'   r   zVitsAttention.forward  s    (,,..Wa {{=11DL@ [[]!;!;REE
{{4;;}#=#=r3GGDN*B>
Ct{{<#>>CZP$Z_j1
(|(*5//!$$yz/C/CAq/I/IJJ3#7'"JJJ*dn8LgW^7_ * * %%''* *  
 '&*&C&CDNT[&\&\##l<9P9Z9Z[]_a9b9bccOGGXXLL(L%""$$a'(BBB ta'8Rtt]k]p]p]r]rtt   (,,S$.'7SSVddL',,S4>-A7GTTL},,\r,BB&##%%$.)::: 1t~FW 1 1',,..1 1   +//2q!<<|?P?PQTVZVdfmov?w?wwL',,S4>-A7GTTL 	)
 %1$5$5c4>7T[$\$\!055cDN6JGU\]]LL$(!]**<4<RVR_*``
i
L99#"6!OOO)CRVR_3` ) )$$&&) )  
 '(,(E(EdnV](^(^%#KKJWW <(8:STTL<'K!&&sDNGT]SS!++Aq11 "))#wGGmmK00111r&   c           	          t          || j        dz   z
  d          }|dk    r&t          j                            |dd||ddg          }t          | j        dz   |z
  d          }|d|z  z   dz
  }|d d ||f         S )Nr   r   r\   )r_   r  r   rN   r<   )r   relative_embeddingsr@  
pad_lengthslice_start_positionslice_end_positions         r'   r  z&VitsAttention._get_relative_embeddings  s    4#3a#78!<<
>>"$-"3"34G!QPZ\fhiklIm"n"n"D$4q$8F#BAFF1AJ>B"111&:;M&M#MNNr&   c                 l   |                                 \  }}}t          j                            |g d          }|                    ||dz  |z  g          }t          j                            |d|dz
  ddg          }|                    ||dz   d|z  dz
  g          }|d d d ||dz
  d f         }|S )N)r   r   r   r   r   r   r\   r   r   r\  r   rN   r<   r  r   xbatch_headsr@  r   x_flatx_finals          r'   r  z5VitsAttention._relative_position_to_absolute_position  s    !"VQ Ma!3!3!344 fqj6&9:;;""6Avz1a+@AA ++{FQJF
QGHH!!!WfWfqjll23r&   c           	      d   |                                 \  }}}t          j                            |d|dz
  ddddg          }|                    ||d|z  dz
  z  g          }t          j                            ||dddg          }|                    ||d|z  g          d d d d dd f         }|S )Nr   r   r\   r  r  s          r'   r  z5VitsAttention._absolute_position_to_relative_position  s    !"VQ Ma!VaZAq!!<==fF
Q&?@AA ""6FAq!+<==++{FAJ?@@AAAqrrJr&   )NNNF)r   r   r    r!   r   r   r"   Tensorr   r  r   boolr   r   r  r  r  r   r   s   @r'   r|  r|  e  sJ       IIrz r r r r r r2eU\ eC ec e e e e 481526"'`2 `2|`2 #5<0`2 !.	`2
 "%,/`2  `2 
u|Xel33	4`2 `2 `2 `2DO O O  
 
 
 
 
 
 
r&   r|  c                   $     e Zd Z fdZd Z xZS )VitsFeedForwardc                    t                                                       t          j        |j        |j        |j                  | _        t          j        |j        |j        |j                  | _        t          j	        |j
                  | _        t          |j        t                    rt          |j                 | _        n|j        | _        |j        dk    r&|j        dz
  dz  }|j        dz  }||ddddg| _        d S d | _        d S )Nr   r\   r   )r   r   r   r   r   ffn_dimffn_kernel_sizers  rv  r   activation_dropoutr   
isinstance
hidden_actstrr
   act_fnr   )r   r   pad_left	pad_rightr   s       r'   r   zVitsFeedForward.__init__  s    i 2FNFDZ[[i0BFDZ[[z&";<<f'-- 	, !23DKK +DK!A%%.2q8H.!3I$iAq!<DLLLDLLLr&   c                    |                     ddd          }|                     ddd          }||z  }| j        %t          j                            || j                  }|                     |          }|                     |          }|                     |          }||z  }| j        %t          j                            || j                  }|                     |          }||z  }|                     ddd          }|S )Nr   r\   r   )	r=  r   r   rN   r<   rs  r  r   rv  )r   r   r   s      r'   r   zVitsFeedForward.forward  s    %--aA66#++Aq!44%4<#M--mT\JJMM22M22]33%4<#M--mT\JJMM22%4%--aA66r&   rl  r   s   @r'   r  r    sG                 $      r&   r  c            	       l     e Zd Zdef fdZ	 	 d
dej        dej        deej                 de	fd	Z
 xZS )VitsEncoderLayerr   c                 h   t                                                       t          |          | _        t	          j        |j                  | _        t	          j        |j	        |j
                  | _        t          |          | _        t	          j        |j	        |j
                  | _        d S )Nrp  )r   r   r|  	attentionr   r   hidden_dropoutr   r-  r   rt  
layer_normr  feed_forwardfinal_layer_normr   s     r'   r   zVitsEncoderLayer.__init__6  s    &v..z&"788,v'9v?TUUU+F33 "V-?VEZ [ [ [r&   NFr   r   r  r  c                 :   |}|                      |||          \  }}|                     |          }|                     ||z             }|}|                     ||          }|                     |          }|                     ||z             }|f}|r||fz  }|S )N)r   r  r  )r  r   r  r  r  )r   r   r   r  r  r   r  rR   s           r'   r   zVitsEncoderLayer.forward>  s     !&*nn')/ '5 '
 '
#| ]33=(@AA ))-FF]33--h.FGG " 	'&Gr&   r  )r   r   r    r   r   r"   r  r#   r   r  r   r   r   s   @r'   r  r  5  s        \z \ \ \ \ \ \ 26"' | ' !.	
         r&   r  c                        e Zd Zdef fdZ	 	 	 	 ddej        dej        deej                 dee	         dee	         d	ee	         d
e
eef         fdZ xZS )VitsEncoderr   c                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        j	        | _	        d S )Nc                 .    g | ]}t                    S r%   )r  )r   r   r   s     r'   r   z(VitsEncoder.__init__.<locals>.<listcomp>`  s"    $g$g$g!%5f%=%=$g$g$gr&   F)
r   r   r   r   r   r   num_hidden_layerslayersgradient_checkpointing	layerdropr   s    `r'   r   zVitsEncoder.__init__]  sh    m$g$g$g$guVMeGfGf$g$g$ghh&+#)r&   Nr   r   r  r  output_hidden_statesreturn_dictr
  c                 B   |rdnd }|rdnd }|t          ||j                  }||z  }t                      pt          |           }	| j        D ]}
|r||fz   }t
          j                            dd          }| j        o
|| j	        k     }|r|	rD| j
        r&| j        r|                     |
j        ||||          }n |
||||          }|d         }|rd}|r||d         fz   }||z  }|r||fz   }|st          d |||fD                       S t          |||          S )Nr%   r   r   )r  r   r  )NNc              3      K   | ]}||V  	d S r.   r%   )r   vs     r'   	<genexpr>z&VitsEncoder.forward.<locals>.<genexpr>  s(      mmq_`_l_l_l_l_lmmr&   )r*   r   r   )r   rW  r   r   r  rK   randomuniformr  r  r  _gradient_checkpointing_func__call__tupler   )r   r   r   r  r  r  r  all_hidden_statesall_self_attentionssynced_gpusencoder_layerdropout_probabilityskip_the_layerlayer_outputss                 r'   r   zVitsEncoder.forwardd  s    #7@BBD$5?bb4 %7H[\\N%4022R6LT6R6R![ 	P 	PM# I$58H$H! #%)"3"3Aq"9"9!]U0Cdn0TN! 1[ 1. 4= $($E$E%.%$&)% %MM %2M%'5%1*;	% % %M !.a 0 - ,  P&9]1=M<O&O#%4 	E 1]4D D 	nmm]4EGZ$[mmmmmm++*
 
 
 	
r&   )NNNN)r   r   r    r   r   r"   r#   r   r  r  r   r   r   r   r   r   s   @r'   r  r  \  s        *z * * * * * * 26,0/3&*B
 B
(B
 'B
 !.	B

 $D>B
 'tnB
 d^B
 
uo%	&B
 B
 B
 B
 B
 B
 B
 B
r&   r  c                        e Zd ZdZdef fdZd Zd Z	 	 	 	 ddej	        d	ej
        d
eej	                 dee         dee         dee         deeej	                 ef         fdZ xZS )VitsTextEncoderzs
    Transformer encoder that uses relative positional representation instead of absolute positional encoding.
    r   c                 $   t                                                       || _        t          j        |j        |j        |j                  | _        t          |          | _
        t          j        |j        |j        dz  d          | _        d S )Nr\   r   )r   )r   r   r   r   	Embedding
vocab_sizer   pad_token_idembed_tokensr  encoderr   r   projectr   s     r'   r   zVitsTextEncoder.__init__  sw    L):F<NPVPcdd"6**y!3V5E5IWXYYYr&   c                     | j         S r.   r  r   s    r'   get_input_embeddingsz$VitsTextEncoder.get_input_embeddings        r&   c                     || _         d S r.   r  )r   r[   s     r'   set_input_embeddingsz$VitsTextEncoder.set_input_embeddings  s    !r&   NT	input_idsr   r  r  r  r  r
  c                    |                      |          t          j        | j        j                  z  }|                     ||||||          }|s|d         n|j        }	|                     |	                    dd                                        dd          |z  }
t          j
        |
| j        j        d          \  }}|s|	||f|dd          z   }|S t          |	|||j        |j                  S )N)r   r   r  r  r  r  r   r   r\   rX   )r*   r+   r,   r   r   )r  r>  rj   r   r   r  r*   r  r1  r"   r   r   r)   r   r   )r   r  r   r  r  r  r  r   encoder_outputsr*   r   r+   r,   rR   s                 r'   r   zVitsTextEncoder.forward  s!    )))44tyAX7Y7YY,,'%)/!5# ' 
 
 7BhOA..Gh.88A>>??II!QOOR^^+0;udk>SYZ+[+[+[(( 	(+7JKo^_^`^`NaaGN$/# 3)7&1
 
 
 	
r&   )NNNT)r   r   r    r!   r   r   r  r  r"   r  r#   r   r  r   r   r)   r   r   r   s   @r'   r  r    s        Zz Z Z Z Z Z Z! ! !" " " 26,0/3&*#
 #
<#
 '#
 !.	#

 $D>#
 'tn#
 d^#
 
uU\"$99	:#
 #
 #
 #
 #
 #
 #
 #
r&   r  c                   (    e Zd ZdZeZdZdZdZd Z	dS )VitsPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    vitsr  Tc                    t          |t          j                  rT|j        j                            d| j        j                   |j         |j        j        	                                 dS dS t          |t          j
                  r?|j        j        	                                 |j        j                            d           dS t          |t          j                  rt          j                            |j                   |j        Yt          j        |j        |j        |j        d         z  z            }t          j                            |j        | |           dS dS t          |t          j                  r]|j        j                            d| j        j                   |j        -|j        j        |j                 	                                 dS dS dS )zInitialize the weightsr@   )r   stdNrW   r   )r   r   )r  r   r  r   datanormal_r   initializer_ranger   zero_r-  fill_r   initkaiming_normal_r>  rj   r&  r   r   uniform_r  padding_idx)r   moduleks      r'   _init_weightsz!VitsPreTrainedModel._init_weights  s   fbi(( 	?M&&CT[5R&SSS{& &&((((( '&-- 	?K""$$$M$$S)))))	** 	?G##FM222{&Ifmv/AFDVWXDY/YZ[[  a 88888 '& -- 	?M&&CT[5R&SSS!-"6#56<<>>>>>	? 	?--r&   N)
r   r   r    r!   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointingr  r%   r&   r'   r  r    sE         
 L!O&*#? ? ? ? ?r&   r  aI  
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`VitsConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0,
            1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        speaker_id (`int`, *optional*):
            Which speaker embedding to use. Only used for multispeaker models.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
z6The complete VITS model, for text-to-speech synthesis.c                   8    e Zd Zdef fdZd Z ee           ee	e
          	 	 	 	 	 	 	 ddeej                 deej                 dee         d	ee         d
ee         dee         deej                 deee         e	f         fd                        Z xZS )	VitsModelr   c                 &   t                                          |           || _        t          |          | _        t          |          | _        t          |          | _        |j	        rt          |          | _        nt          |          | _        |j        dk    r$t          j        |j        |j                  | _        t%          |          | _        |j        | _        |j        | _        |j        | _        |                                  d S rD  )r   r   r   r  text_encoderr  r"  r   decoder"use_stochastic_duration_predictionrL  duration_predictorrn  num_speakersr   r  r   embed_speakerr   posterior_encoderspeaking_raterc  noise_scale_duration	post_initr   s     r'   r   zVitsModel.__init__5  s       +F33-f55	"6**4 	D&Ef&M&MD##&;F&C&CD#""!#f.A6C`!a!aD "6f!=!= $1!-$*$?! 	r&   c                     | j         S r.   )r  r  s    r'   get_encoderzVitsModel.get_encoderO  r  r&   )output_typer  Nr  r  
speaker_idr  r  r  labelsr
  c                 
   ||n| j         j        }||n| j         j        }||n| j         j        }|t	          d          |(|                    d                                          }n9t          j        |                              d                                          }| j         j	        dk    r|d|cxk    r| j         j	        k     s"n t          d| j         j	        dz
   d          t          |t                    rt          j        d|| j        	          }|                     |                              d          }	nd}	|                     ||||||
          }
|s|
d         n|
j        }|                    dd          }|                    dd          }|s|
d         n|
j        }|s|
d         n|
j        }| j         j        r |                     |||	d| j                  }n|                     |||	          }d| j        z  }t          j        t          j        |          |z  |z            }t          j        t          j        |ddg          d                                          }t          j        |                                |j         |j                  }|                    d          |                    d          k     }|                    d          !                    |j                   }t          j        |d          t          j        |d          z  }|j"        \  }}}}t          j#        |d          $                    ||z  d          }t          j        ||j         |j                  }|                    d          |k     }|!                    |j                   $                    |||          }|tJ          j&        '                    |g d          ddddf         z
  }|                    d                              dd          |z  }t          j(        |)                    d          |                              dd          }t          j(        |)                    d          |                              dd          }|t          j*        |          t          j        |          z  | j+        z  z   }| ,                    |||	d          }||z  }| -                    ||	          } | )                    d          } |t]          j/        | j         j0                  z  }!|s| |!|f|
dd         z   }"|"S tc          | |!||
j2        |
j3                  S )aZ  
        labels (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`, *optional*):
            Float values of target spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss
            computation.

        Returns:

        Example:

        ```python
        >>> from transformers import VitsTokenizer, VitsModel, set_seed
        >>> import torch

        >>> tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
        >>> model = VitsModel.from_pretrained("facebook/mms-tts-eng")

        >>> inputs = tokenizer(text="Hello - my dog is cute", return_tensors="pt")

        >>> set_seed(555)  # make deterministic

        >>> with torch.no_grad():
        ...     outputs = model(inputs["input_ids"])
        >>> outputs.waveform.shape
        torch.Size([1, 45824])
        ```
        Nz&Training of VITS is not supported yet.r?   r   r   z Set `speaker_id` in the range 0-.r   )r\  
fill_valuerV  )r  r   r  r  r  r  r\   T)rE   rc  rW   )rW  rV  )r   r   r   r   r   r   r	   r  )r   r   r   r   r   )4r   r  r  use_return_dictNotImplementedError	unsqueezefloatr"   	ones_liker  r`   r  r   fullrV  r  r  r*   r1  r+   r,   r  r  r   r  ceilrM   r`  re   longaranger_   rW  r]  ra   rc   r  r   rN   r<   r  squeezer   rc  r"  r  rK   prodr   r   r   r   )#r   r  r  r%  r  r  r  r&  input_padding_maskspeaker_embeddingstext_encoder_outputr   r+   r,   rk  length_scaledurationpredicted_lengthsindicesoutput_padding_mask	attn_maskr?  r   output_lengthinput_lengthcum_durationvalid_indicespadded_indicesattnprior_latentsri  r   r   r   rR   s#                                      r'   r   zVitsModel.forwardR  s   L 2C1N--TXT_Tq$8$D  $+Jj 	 &1%<kk$+B]%&NOOO%!/!9!9"!=!=!C!C!E!E!&!;!;!E!Eb!I!I!O!O!Q!Q;#a''J,B
====T[%===== !cDKD\_`D`!c!c!cddd*c** ^"ZTjQUQ\]]]
!%!3!3J!?!?!I!I"!M!M!%"//+)/!5# 0 
 
 7Bl+A..GZGl%//155/99!Q??4?d)!,,EXEd<Gt1!44M`Mt;9 		j22"" 5 3  LL  22=BTVhiiLT//:ei558JJ\YZZ!OEIhA,G,GKKPPRR ,04466>O>U^o^vwww%//225F5P5PQR5S5SS1;;A>>AABTBZ[[ O$6::U_M`bd=e=ee	5>_2
A}l|Hb1166zL7PRSTT,}HN8?[[[))!,,|;%((99>>z<Yfgg&):):=J\J\J\)])]^_^_^_adbdad^d)ee''**44Q::YF l4<<??K@@JJ1aPP#l4<<??<OPPZZ[\^_``#e&6{&C&CeiPcFdFd&dgkgw&ww))M+>@R\`)aa 33<<-?@@##A&&,rwt{7Q/R/RR 	!1;?BUVWVXVXBYYGN-#-;*5
 
 
 	
r&   )NNNNNNN)r   r   r    r   r   r#  r   VITS_INPUTS_DOCSTRINGr   r   _CONFIG_FOR_DOCr   r"   r  r   r  r#   r   r   r   r   r   r   s   @r'   r  r  0  sL       
z      4! ! ! +*+@AA?YYY -115$(,0/3&*.2}
 }
EL)}
 !.}
 SM	}

 $D>}
 'tn}
 d^}
 *+}
 
uSz?*	+}
 }
 }
 ZY BA}
 }
 }
 }
 }
r&   r  )Fr9   r:   r:   r:   )Br!   r>  dataclassesr   typingr   r   r   r   numpyrK   r"   torch.utils.checkpointr   activationsr
   integrations.deepspeedr   integrations.fsdpr   modeling_attn_mask_utilsr   modeling_outputsr   r   modeling_utilsr   r   r   r   r   r   configuration_vitsr   
get_loggerr   loggerrF  r   r)   jitscriptr8   rU   rO   Moduler   r   r   r   r  r  r$  r4  rB  rL  rn  r|  r  r  r  r  r  VITS_START_DOCSTRINGrE  r  r%   r&   r'   <module>rX     s      ! ! ! ! ! ! . . . . . . . . . . . .                ! ! ! ! ! ! @ @ @ @ @ @ 7 7 7 7 7 7 B B B B B B        . - - - - - t t t t t t t t t t t t * * * * * * 
	H	%	%  : : : : :k : : :@ : : : : :K : : :>    G  G  G  G TE% E% E%PM5 M5 M5 M5 M5%(/ M5 M5 M5`) ) ) ) )29 ) ) )&; ; ; ; ;29 ; ; ;|U U U U U") U U Up! ! ! ! !	 ! ! !6    	   &+% +% +% +% +%BI +% +% +%\(! (! (! (! (!29 (! (! (!V! ! ! ! !BI ! ! !$a  a  a  a  a bi a  a  a H"% "% "% "% "%BI "% "% "%Jc c c c cBI c c cL' ' ' ' 'bi ' ' 'T$ $ $ $ $ry $ $ $NJ
 J
 J
 J
 J
") J
 J
 J
Z5
 5
 5
 5
 5
bi 5
 5
 5
p? ? ? ? ?/ ? ? ?> " > < ]
 ]
 ]
 ]
 ]
# ]
 ]
	 ]
 ]
 ]
r&   