
    g                     2    d Z ddlmZ  G d de          ZdS )z$Speech processor class for SpeechT5.   )ProcessorMixinc                   B     e Zd ZdZdZdZ fdZd Zd Zd Z	d Z
 xZS )	SpeechT5Processora}  
    Constructs a SpeechT5 processor which wraps a feature extractor and a tokenizer into a single processor.

    [`SpeechT5Processor`] offers all the functionalities of [`SpeechT5FeatureExtractor`] and [`SpeechT5Tokenizer`]. See
    the docstring of [`~SpeechT5Processor.__call__`] and [`~SpeechT5Processor.decode`] for more information.

    Args:
        feature_extractor (`SpeechT5FeatureExtractor`):
            An instance of [`SpeechT5FeatureExtractor`]. The feature extractor is a required input.
        tokenizer (`SpeechT5Tokenizer`):
            An instance of [`SpeechT5Tokenizer`]. The tokenizer is a required input.
    SpeechT5FeatureExtractorSpeechT5Tokenizerc                 L    t                                          ||           d S )N)super__init__)selffeature_extractor	tokenizer	__class__s      l/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/speecht5/processing_speecht5.pyr
   zSpeechT5Processor.__init__%   s$    *I66666    c                 h   |                     dd          }|                     dd          }|                     dd          }|                     dd          }|                     dd          }||t          d          ||t          d          ||||t          d	          | | j        |g|R d|i|}n| | j        |fi |}nd}| | j        |||d
|}	|	d         }
n| | j        |fi |}	|	d         }
nd}	||	S |	!|
|d<   |	                    d          }|||d<   |S )a  
        Processes audio and text input, as well as audio and text targets.

        You can process audio by using the argument `audio`, or process audio targets by using the argument
        `audio_target`. This forwards the arguments to SpeechT5FeatureExtractor's
        [`~SpeechT5FeatureExtractor.__call__`].

        You can process text by using the argument `text`, or process text labels by using the argument `text_target`.
        This forwards the arguments to SpeechT5Tokenizer's [`~SpeechT5Tokenizer.__call__`].

        Valid input combinations are:

        - `text` only
        - `audio` only
        - `text_target` only
        - `audio_target` only
        - `text` and `audio_target`
        - `audio` and `audio_target`
        - `text` and `text_target`
        - `audio` and `text_target`

        Please refer to the docstring of the above two methods for more information.
        audioNtexttext_targetaudio_targetsampling_ratez\Cannot process both `audio` and `text` inputs. Did you mean `audio_target` or `text_target`?z\Cannot process both `audio_target` and `text_target` inputs. Did you mean `audio` or `text`?zaYou need to specify either an `audio`, `audio_target`, `text`, or `text_target` input to process.)r   r   input_values	input_idslabelsattention_maskdecoder_attention_mask)pop
ValueErrorr   r   get)r   argskwargsr   r   r   r   r   inputstargetsr   r   s               r   __call__zSpeechT5Processor.__call__(   s   0 

7D))zz&$''jj55zz.$77

?D99!1n   #(?n   =\1dl{GZs   +T+E`D````Y_``FF#T^D33F33FFF#,d,,]juuntuuG^,FF$$dn[;;F;;G[)FFG>N%F8%,[[1A%B%B"%13I/0r   c                    |                     dd          }|                     dd          }|                     dd          }||t          d          |||t          d          | | j        j        |g|R i |}n| | j        j        |fi |}nd}|d|v st          |t                    r&d|d         v r | j        j        |fi |}|d         }nO| j        j        }| j        j        | j        _         | j        j        |g|R i |}|| j        _        |d         }nd}||S |!||d<   |	                    d          }	|	|	|d	<   |S )
au  
        Collates the audio and text inputs, as well as their targets, into a padded batch.

        Audio inputs are padded by SpeechT5FeatureExtractor's [`~SpeechT5FeatureExtractor.pad`]. Text inputs are padded
        by SpeechT5Tokenizer's [`~SpeechT5Tokenizer.pad`].

        Valid input combinations are:

        - `input_ids` only
        - `input_values` only
        - `labels` only, either log-mel spectrograms or text tokens
        - `input_ids` and log-mel spectrogram `labels`
        - `input_values` and text `labels`

        Please refer to the docstring of the above two methods for more information.
        r   Nr   r   z:Cannot process both `input_values` and `input_ids` inputs.zZYou need to specify either an `input_values`, `input_ids`, or `labels` input to be padded.    r   r   )
r   r   r   padr   
isinstancelistfeature_sizenum_mel_binsr   )
r   r   r    r   r   r   r!   r"   feature_size_hackr   s
             r   r&   zSpeechT5Processor.pado   s   " zz.$77JJ{D11	Hd++#	(=YZZZI$5&.l   #/T+/NtNNNvNNFF"'T^'	<<V<<FFFf$$FD)A)A$kU[\]U^F^F^,$.,V>>v>> -$($:$G!6:6L6Y&34$04VMdMMMfMM6G&3 0G>N%F8%,[[1A%B%B"%13I/0r   c                 &     | j         j        |i |S )z
        This method forwards all its arguments to SpeechT5Tokenizer's [`~SpeechT5Tokenizer.batch_decode`]. Please refer
        to the docstring of this method for more information.
        )r   batch_decoder   r   r    s      r   r-   zSpeechT5Processor.batch_decode   s    
 +t~*D;F;;;r   c                 &     | j         j        |i |S )z
        This method forwards all its arguments to SpeechT5Tokenizer's [`~SpeechT5Tokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        )r   decoder.   s      r   r0   zSpeechT5Processor.decode   s    
 %t~$d5f555r   )__name__
__module____qualname____doc__feature_extractor_classtokenizer_classr
   r#   r&   r-   r0   __classcell__)r   s   @r   r   r      s          9)O7 7 7 7 7E E EN: : :x< < <6 6 6 6 6 6 6r   r   N)r4   processing_utilsr   r    r   r   <module>r:      s]    + * . . . . . .c6 c6 c6 c6 c6 c6 c6 c6 c6 c6r   