
    gx!                     z    d dl mZmZ ddlmZ ddlmZ  e            rddlmZ ddl	m
Z
 dZ G d	 d
e          ZdS )    )ListUnion   )is_torch_available   )Pipeline)%MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING)SpeechT5HifiGanzmicrosoft/speecht5_hifiganc                   v     e Zd ZdZddd fd
Zd Zd Zdeee	e         f         f fdZ
	 	 	 dd	Zd
 Z xZS )TextToAudioPipelinea5  
    Text-to-audio generation pipeline using any `AutoModelForTextToWaveform` or `AutoModelForTextToSpectrogram`. This
    pipeline generates an audio file from an input text and optional other conditional inputs.

    Example:

    ```python
    >>> from transformers import pipeline

    >>> pipe = pipeline(model="suno/bark-small")
    >>> output = pipe("Hey it's HuggingFace on the phone!")

    >>> audio = output["audio"]
    >>> sampling_rate = output["sampling_rate"]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    <Tip>

    You can specify parameters passed to the model by using [`TextToAudioPipeline.__call__.forward_params`] or
    [`TextToAudioPipeline.__call__.generate_kwargs`].

    Example:

    ```python
    >>> from transformers import pipeline

    >>> music_generator = pipeline(task="text-to-audio", model="facebook/musicgen-small", framework="pt")

    >>> # diversify the music generation by adding randomness with a high temperature and set a maximum music length
    >>> generate_kwargs = {
    ...     "do_sample": True,
    ...     "temperature": 0.7,
    ...     "max_new_tokens": 35,
    ... }

    >>> outputs = music_generator("Techno music with high melodic riffs", generate_kwargs=generate_kwargs)
    ```

    </Tip>

    This pipeline can currently be loaded from [`pipeline`] using the following task identifiers: `"text-to-speech"` or
    `"text-to-audio"`.

    See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=text-to-speech).
    N)vocodersampling_ratec                z    t                      j        |i | | j        dk    rt          d          d | _        | j        j        t          j                    v r?|6t          j
        t                                        | j        j                  n|| _        || _        | j        | j        j        j        | _        | j        t| j        j        }| j        j                            dd           }|'|                    |                                           dD ]}t)          ||d           }||| _        d S d S )Ntfz5The TextToAudioPipeline is only available in PyTorch.generation_config)sample_rater   )super__init__	framework
ValueErrorr   model	__class__r	   valuesr
   from_pretrainedDEFAULT_VOCODER_IDtodevicer   config__dict__getupdateto_dictgetattr)	selfr   r   argskwargsr   
gen_configsampling_rate_namer   s	           `/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/pipelines/text_to_audio.pyr   zTextToAudioPipeline.__init__L   sM   $)&)))>T!!TUUU:#H#O#Q#QQQ ?  /0BCCFFtzGXYYY L +<#!%!4!BD% Z&F,001DdKKJ%j0022333&F 7 7" '0BD I I ,)6D& &%7 7    c                     t          |t                    r|g}| j        j        j        dk    r=| j        j                            dd          ddddd}|                    |           |} | j	        |fi |dd	i}|S )
Nbarkmax_input_semantic_length   FT
max_length)r/   add_special_tokensreturn_attention_maskreturn_token_type_idspaddingreturn_tensorspt)

isinstancestrr   r   
model_typer   semantic_configr    r!   	tokenizer)r$   textr&   
new_kwargsoutputs        r)   
preprocesszTextToAudioPipeline.preprocessk   s    dC   	6D:'611 #4DHHIdfijj&+)-).' J f%%%FDDDDtDDDr*   c                     |                      || j                  }|d         }|d         }| j                                        rT|                      || j                  }d|vr
| j        |d<   |                    |            | j        j        di ||}nHt          |          r$t          d|	                                            | j        di ||d         }| j
        | 
                    |          }|S )N)r   forward_paramsgenerate_kwargsr   a\  You're using the `TextToAudioPipeline` with a forward-only model, but `generate_kwargs` is non empty.
                                 For forward-only TTA models, please use `forward_params` instead of of
                                 `generate_kwargs`. For reference, here are the `generate_kwargs` used here:
                                 r    )_ensure_tensor_on_devicer   r   can_generater   r!   generatelenr   keysr   )r$   model_inputsr&   r@   rA   r=   s         r)   _forwardzTextToAudioPipeline._forward   s:   ..vdk.JJ 01 !23:""$$ 	E";;OTXT_;``O #/997;7M 34 !!/222(TZ(JJ<J>JJFF?##  = #2"6"6"8"8= =    TZAA,A.AA!DF<#\\&))Fr*   text_inputsc                 8     t                      j        |fi |S )a  
        Generates speech/audio from the inputs. See the [`TextToAudioPipeline`] documentation for more information.

        Args:
            text_inputs (`str` or `List[str]`):
                The text(s) to generate.
            forward_params (`dict`, *optional*):
                Parameters passed to the model generation/forward method. `forward_params` are always passed to the
                underlying model.
            generate_kwargs (`dict`, *optional*):
                The dictionary of ad-hoc parametrization of `generate_config` to be used for the generation call. For a
                complete overview of generate, check the [following
                guide](https://huggingface.co/docs/transformers/en/main_classes/text_generation). `generate_kwargs` are
                only passed to the underlying model if the latter is a generative model.

        Return:
            A `dict` or a list of `dict`: The dictionaries have two keys:

            - **audio** (`np.ndarray` of shape `(nb_channels, audio_length)`) -- The generated audio waveform.
            - **sampling_rate** (`int`) -- The sampling rate of the generated audio waveform.
        )r   __call__)r$   rJ   r@   r   s      r)   rL   zTextToAudioPipeline.__call__   s$    ,  uww>>~>>>r*   c                 2    |r|ni |r|ni d}|i }i }|||fS )N)r@   rA   rB   )r$   preprocess_paramsr@   rA   paramspostprocess_paramss         r)   _sanitize_parametersz(TextToAudioPipeline._sanitize_parameters   sJ     1?FnnB2AIr
 

 $ " &*<<<r*   c                 
   i }t          |t                    r	|d         }nt          |t                    r|d         }|                                                                                                |d<   | j        |d<   |S )Nwaveformr   audior   )r6   dicttuplecpufloatnumpyr   )r$   rS   output_dicts      r)   postprocesszTextToAudioPipeline.postprocess   s}    h%% 	#
+HH%(( 	#{H'||~~3355;;==G'+'9O$r*   )NNN)__name__
__module____qualname____doc__r   r>   rI   r   r7   r   rL   rQ   r[   __classcell__)r   s   @r)   r   r      s        . .` '+$ 7 7 7 7 7 7 7>  .     D?E#tCy.$9 ? ? ? ? ? ?4 	= = = ="	 	 	 	 	 	 	r*   r   N)typingr   r   utilsr   baser   models.auto.modeling_autor	   !models.speecht5.modeling_speecht5r
   r   r   rB   r*   r)   <module>rf      s            & & & & & &        DQQQQQQCCCCCC1 { { { { {( { { { { {r*   