
    g[                     N   d dl mZ d dlmZmZmZ d dlZd dlZd dlm	Z	 d dl
mZmZ d dlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ  ej        e          Z G d de          Z G d de          Z G d de          Ze G d de                      Z G d de          ZdS )    )	dataclass)OptionalTupleUnionN)CrossEntropyLoss)InstructBlipQFormerConfigInstructBlipVisionConfig)$InstructBlipForConditionalGeneration/InstructBlipForConditionalGenerationModelOutput   )PretrainedConfig)!MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)logging   )CONFIG_MAPPINGc                       e Zd ZdS )InstructBlipVideoVisionConfigN__name__
__module____qualname__     {/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/instructblipvideo/modular_instructblipvideo.pyr   r   )           Dr   r   c                       e Zd ZdS )InstructBlipVideoQFormerConfigNr   r   r   r   r   r   -   r   r   r   c                   V     e Zd ZdZdZ	 	 	 	 	 d
 fd	Zededede	fd	            Z
 xZS )InstructBlipVideoConfiga
  
    [`InstructBlipVideoConfig`] is the configuration class to store the configuration of a
    [`InstructBlipVideoForConditionalGeneration`]. It is used to instantiate a Instructblipvideo model according to the specified
    arguments, defining the vision model, Q-Former model and language model configs. Instantiating a configuration with
    the defaults will yield a similar configuration to that of the Instructblipvideo
    [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`InstructBlipVideoVisionConfig`].
        qformer_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`InstructBlipVideoQFormerConfig`].
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize any [`PretrainedConfig`].
        num_query_tokens (`int`, *optional*, defaults to 32):
            The number of query tokens passed through the Transformer.

        video_token_index (`int`, *optional*):
            Token index of special video token.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import (
    ...     InstructBlipVideoVisionConfig,
    ...     InstructBlipVideoQFormerConfig,
    ...     OPTConfig,
    ...     InstructBlipVideoConfig,
    ...     InstructBlipVideoForConditionalGeneration,
    ... )

    >>> # Initializing a InstructBlipVideoConfig with Salesforce/instruct-blip-flan-t5 style configuration
    >>> configuration = InstructBlipVideoConfig()

    >>> # Initializing a InstructBlipVideoForConditionalGeneration (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration
    >>> model = InstructBlipVideoForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a InstructBlipVideoConfig from a InstructBlipVideoVisionConfig, InstructBlipVideoQFormerConfig and any PretrainedConfig

    >>> # Initializing Instructblipvideo vision, Instructblipvideo Q-Former and language model configurations
    >>> vision_config = InstructBlipVideoVisionConfig()
    >>> qformer_config = InstructBlipVideoQFormerConfig()
    >>> text_config = OPTConfig()

    >>> config = InstructBlipVideoConfig.from_text_vision_configs(vision_config, qformer_config, text_config)
    ```instructblipvideoN    c                 T    t                      j        di | |i }t                              d           |i }t                              d           |i }t                              d           t	          di || _        t          di || _        d|v r|d         nd}t          |         di || _	        | j	        j
        | _
        | j	        j        | _        || _        || _        | j        j        | j        _        | j	        j        t"          v | _        d| _        d| _        d S )	NzZvision_config is None. initializing the InstructBlipVideoVisionConfig with default values.z\qformer_config is None. Initializing the InstructBlipVideoQFormerConfig with default values.zTtext_config is None. Initializing the text config with default values (`OPTConfig`).
model_typeoptg      ?g{Gz?r   )super__init__loggerinfor   vision_configr   qformer_configr   text_configtie_word_embeddingsis_encoder_decodernum_query_tokensvideo_token_indexhidden_sizeencoder_hidden_sizer#   r   use_decoder_only_language_modelinitializer_factorinitializer_range)	selfr)   r*   r+   r.   r/   kwargstext_model_type	__class__s	           r   r&   z InstructBlipVideoConfig.__init__k   sD    	""6""" MKKtuuu!NKKvwwwKKKnooo:KK]KK<NN~NN7C{7R7R+l33X])/:II[II#'#3#G "&"2"E 0!2262D2P//3/?/JNo/o,"%!%r   r)   r*   r+   c                      | d|                                 |                                 |                                 d|S )a  
        Instantiate a [`InstructBlipVideoConfig`] (or a derived class) from a InstructBlipVideo vision model, Q-Former and
        language model configurations.

        Returns:
            [`InstructBlipVideoConfig`]: An instance of a configuration object
        )r)   r*   r+   r   )to_dict)clsr)   r*   r+   r6   s        r    from_vision_qformer_text_configsz8InstructBlipVideoConfig.from_vision_qformer_text_configs   sY      s 
'//11)1133#++--
 
 	
 
 	
r   )NNNr!   N)r   r   r   __doc__r#   r&   classmethodr   r   r   r<   __classcell__)r8   s   @r   r   r   1   s        5 5n %J $& $& $& $& $& $&L 
4
 7
 &	
 
 
 [
 
 
 
 
r   r   c                       e Zd ZdS )4InstructBlipVideoForConditionalGenerationModelOutputNr   r   r   r   rA   rA      s        Dr   rA   c                      e Zd Z	 	 	 	 	 	 	 	 	 	 ddej        dej        deej                 deej                 deej                 deej                 d	eej                 d
ee         dee         deej                 dee         dedee	e
f         fdZ ej                    	 	 	 	 	 ddej        deej                 deej                 deej                 deej                 dedej        fd            ZdS ))InstructBlipVideoForConditionalGenerationNFpixel_valuesqformer_input_idsqformer_attention_mask	input_idsattention_maskdecoder_input_idsdecoder_attention_maskoutput_attentionsoutput_hidden_stateslabelsreturn_dictinterpolate_pos_encodingreturnc           
      |	   ||n| j         j        }|j        \  }}}}}|                    ||z  |||          }|                     |||	||          }|d         }t          j        |                                dd         t
          j        |j	                  }| j
                            |j        d         dd          }t          j        |                                dd         t
          j        |j	                  }|t          j        |          }|                    |d          }|                    |d          }t          j        ||gd          }|                     |||||||	|          }|d         ddd|                    d          ddf         }|                     |          }|                    || j         j        |z  d          }t          j        |                                dd         t
          j        |j	                  } | j                                        |          }|t          j        |          }t)          | j         d	d          N|| j         j        k                        d                              |          }|                                ||<   nzt2                              d
           t          j        ||                    |j	                  gd          }t          j        ||                    |j	                  gd          }| j         j        r|                     ||||	|          }|r|j        n|d         }d}|
|
                    |j	                  }
|dd|
                    d           dddf         }|dddddf                                         } |
dddf                                                             |j	                  }!t?          d          }" |"|                      d| j         j!        j"                  |!                     d                    }n?|                     ||||||	||
          }|r|j#        n|d         }|r|j        n|d         }|s||||f}#||f|#z   n|#S tI          |||||          S )a0
  
        ```python
        >>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
        >>> import torch
        >>> from huggingface_hub import hf_hub_download
        >>> import av
        >>> import numpy as np

        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`List[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])

        >>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
        >>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

        >>> file_path = hf_hub_download(
        ...       repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample uniformly 4 frames from the videWhy is this video funny?o
        >>> total_frames = container.streams.video[0].frames
        >>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
        >>> clip = read_video_pyav(container, indices)

        >>> prompt = "What is happening in the video?"
        >>> inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)

        >>> outputs = model.generate(
        ...     **inputs,
        ...     do_sample=False,
        ...     num_beams=5,
        ...     max_length=256,
        ...     repetition_penalty=1.5,
        ...     length_penalty=1.0,
        ... )
        >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
        >>> print(generated_text)
        "A person is eating a bowl of pasta, and they are using a fork to eat it. The person is sitting at a table, and the plate of pasta is on the table in front"
        ```N)rD   rK   rL   rN   rO   r   dtypedevicedim   )rG   rH   query_embedsencoder_hidden_statesencoder_attention_maskrK   rL   rN   r/   K  Expanding inputs for video tokens in InstructBLIPVideo should be done in processing. Please follow instruction here (https://gist.github.com/zucchini-nlp/65f22892b054dc0d68228af56fbeaac2) to update your InstructBLIPVideo model. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.)inputs_embedsrH   rK   rL   rN   .mean)	reduction)r]   rH   rI   rJ   rK   rL   rN   rM   )losslogitsvision_outputsqformer_outputslanguage_model_outputs)%configuse_return_dictshapereshapevision_modeltorchonessizelongrU   query_tokensexpand	ones_likerepeat_interleavecatqformerlanguage_projectionr.   language_modelget_input_embeddingsgetattrr/   	unsqueeze	expand_asflattenr'   warning_oncetor2   ra   
contiguousr   viewr+   
vocab_sizer`   rA   )$r5   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   
batch_sizeframeschannelheightwidthrb   image_embedsimage_attention_maskrn   query_attention_maskquery_outputsquery_outputlanguage_model_inputslanguage_model_attention_maskr]   special_image_maskoutputsra   r`   shift_logitsshift_labelsloss_fctoutputs$                                       r   forwardz1InstructBlipVideoForConditionalGeneration.forward   sF   N &1%<kk$+B] 6B5G2
FGVU#++J,?&RWXX**%/!5#%= + 
 
 &a(  %z,*;*;*=*=crc*B%*]i]pqqq (//0B10Er2NN$z,*;*;*=*=crc*B%*]i]pqqq!)%*_5F%G%G"-??A?NN!7!I!I&VW!I!X!X!&,@BX+Y_`!a!a!a'1%".#7/!5# % 	
 	
 %Q'+A\->->q-A-A+A111(DE !% 8 8 F F !6 = =j$+JfioJoqs t t(-
!&&(("-UZH]Hd)
 )
 )
% C+@@BB9MM!"_Y77N 4; 3T::F"+t{/L"L!W!WXZ![![!e!efs!t!t0E0M0M0O0OM,--z  
 "I'<m>N>NOdOk>l>l&mstuuuM"Y.0A0AB_Bf0g0ghno  N ;6 "	C))+-"3%9' *  G (3BW^^
FD!6=11FKKNN?#4#4aaa 78%c3B3k2==??%c122g99;;>>v}MM ,f===x 1 1"dk6M6X Y Y[g[l[lmo[p[pqq))+-"3'="3%9' * 	 	G $/>7<<GAJD'2BW^^
F 	FnmWEF)-)9TGf$$vEC))#*
 
 
 	
r   c                 	   t          | d          r|                                  |j        \  }}	}
}}|                    ||	z  |
||          }|                     |d|          j        }t          j        |                                dd         t          j	        |j
                  }| j                            |j        d         dd          }t          j        |                                dd         t          j	        |j
                  }|t          j        |          }|                    |	d          }|                    |	d          }t          j        ||gd	          }|                     |||||d
          }|j        ddd|                    d	          ddf         }|                     |          }|                    || j        j        |	z  d          }t          j        |                                dd         t          j	        |j
                  }|Qt          j        | j        j        j        gg                              |d	                              |j
                  }|t          j        |          } |                                 |          }t5          | j        dd          N|| j        j        k                        d                              |          }|                                ||<   nt>                               d           t          j        ||                    |j
                  gd	          }t          j        ||                    |j
                  gd	          }| j!        j        j"        sQ|#                    dd          |j        d	         z   d	z
  |d<   |#                    dd          |j        d	         z   |d<    | j!        j$        d||d|}| j!        j        j"        s| j        j        j%        d         dk    rdn| j        j        j        }t          j        |gg                              |d	                              |j
                  }tM          |t          j'                  s#t          j        ||j(        gd          |_(        nt          j        ||gd          }|S )a  
        Overrides `generate` function to be able to use the model as a conditional generator.

        Args:
            pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width) or
                (batch_size, num_frames, num_channels, height, width)): Input images or videos to be processed.
            qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt to be fed to the Q-Former module.
            qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt for the generation.
            attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate the positional encoding of the image embeddings.

        Returns:
            captions (list): A list of strings of length batch_size * num_captions.
        hf_device_mapT)rN   rO   NrR   rS   r   rV   rX   )rG   rH   rY   rZ   r[   rN   r/   r\   
max_length   
min_length)r]   rH   LLaMAForCausalLMr   r   ))hasattr_preprocess_acceleraterg   rh   ri   last_hidden_staterj   rk   rl   rm   rU   rn   ro   rp   rq   rr   rs   rt   re   r.   
LongTensorr+   bos_token_idrepeatr|   rv   rw   r/   rx   ry   rz   r'   r{   ru   r-   getgeneratearchitectures
isinstanceTensor	sequences)r5   rD   rE   rF   rG   rH   rO   generate_kwargsr   r   r   r   r   r   r   rn   r   r   r   r   language_attention_maskr]   r   r   r   
bos_tokenss                             r   r   z2InstructBlipVideoForConditionalGeneration.generatel  s   > 4)) 	*''))) 6B5G2
FGVU#++J,?&RWXX((%= ) 
 
 	 	
  %z,*;*;*=*=crc*B%*]i]pqqq(//0B10Er2NN$z,*;*;*=*=crc*B%*]i]pqqq!)%*_5F%G%G"-??A?NN!7!I!I&VW!I!X!X!&,@BX+Y_`!a!a!a'1%".#7 % 
 
 %6qqq:PL<M<Ma<P<P:PRSRSRS7ST $ 8 8 F F !6 = =j$+JfioJoqs t t"'*!&&(("-UZH]Hd#
 #
 #
  4;#:#G"H!IJJ
A&&L'(( 
 !"_Y77N31133I>> 4; 3T::F"+t{/L"L!W!WXZ![![!e!efs!t!t0E0M0M0O0OM,--z  
 "I'<m>N>NOdOk>l>l&mstuuuM"Y(.*;*;<S<Z*[*[\bc  N &-@ v#''b99<Q<WXY<ZZ]^^  - 1@0C0CLRS0T0TWlWrstWu0u-.$%. 
')
 
 
 
 ")< 	C ;*8;?QQQ [,9 
 )L>*:;;BB:qQQTTUaUhiiJgu|44 C$)Iz7;L.MSU$V$V$V!!)Z$9rBBBr   )
NNNNNNNNNF)NNNNF)r   r   r   rj   FloatTensorr   r   boolr   r   rA   r   no_gradr   r   r   r   rC   rC      s       
 >B15598<=A,0/3-1&*).{
 {
'{
 !,{
 !))9 :	{

 E-.{
 !!12{
 $E$45{
 !))9 :{
 $D>{
 'tn{
 )*{
 d^{
 #'{
 
uJJ	K{
 {
 {
 {
z U]__ 9==A0459).A A'A $E$45A !))9 :	A
 E,-A !!12A #'A 
	A A A _A A Ar   rC   ) dataclassesr   typingr   r   r   rj   torch.utils.checkpointtorch.nnr   ;transformers.models.instructblip.configuration_instructblipr   r	   6transformers.models.instructblip.modeling_instructblipr
   r   configuration_utilsr   models.auto.modeling_autor   utilsr   autor   
get_loggerr   r'   r   r   r   rA   rC   r   r   r   <module>r      s    " ! ! ! ! ! ) ) ) ) ) ) ) ) ) )      % % % % % %              
 4 3 3 3 3 3 J J J J J J       ! ! ! ! ! ! 
	H	%	%	 	 	 	 	$< 	 	 		 	 	 	 	%> 	 	 	u
 u
 u
 u
 u
. u
 u
 u
p 	 	 	 	 	;j 	 	 	@ @ @ @ @0T @ @ @ @ @r   