
    gY                    ~   d Z ddlZddlmZmZmZmZmZ ddlZddl	Zddlm
Z
 ddlmZ ddlmZmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZ ddlmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z( ddl)m*Z*m+Z+m,Z,  e'j-        e.          Z/dZ0 G d de
j1                  Z2	 ddl3m4Z4 e4Z2e/5                    d           n&# e6$ r Y ne7$ r e/8                    d           Y nw xY w ej9        e2            G d de
j1                  Z: G d de
j1                  Z; G d de
j1                  Z< G d de
j1                  Z= G d de
j1                  Z> G d  d!e          Z?d"Z@d#ZA e#d$e@           G d% d&e?                      ZB G d' d(e
j1                  ZC G d) d*e
j1                  ZD G d+ d,e
j1                  ZE G d- d.e
j1                  ZF G d/ d0e
j1                  ZG G d1 d2e
j1                  ZHd3ZId4ZJd5ZK e#d6eI           G d7 d8e?                      ZL e#d9eI           G d: d;e?e                      ZMdS )<zPix2Struct modeling file    N)DictListOptionalTupleUnion)nn   )ACT2FN)CacheDynamicCacheEncoderDecoderCacheStaticCache)GenerationMixin)AttentionMaskConverter)BaseModelOutputBaseModelOutputWithPooling!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)PreTrainedModel)ALL_LAYERNORM_LAYERS)DUMMY_INPUTS
DUMMY_MASKadd_start_docstrings%add_start_docstrings_to_model_forwardis_torch_fx_proxyis_torchdynamo_compilingloggingreplace_return_docstrings   )Pix2StructConfigPix2StructTextConfigPix2StructVisionConfigr!   c                   &     e Zd Zd fd	Zd Z xZS )Pix2StructLayerNormư>c                     t                                                       t          j        t	          j        |                    | _        || _        dS )zc
        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      n/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/pix2struct/modeling_pix2struct.pyr)   zPix2StructLayerNorm.__init__:   sD     	l5:k#:#:;; #    c                 h   |                     t          j                                      d                              dd          }|t          j        || j        z             z  }| j        j        t          j	        t          j
        fv r|                     | j        j                  }| j        |z  S )N   T)keepdim)tor+   float32powmeanrsqrtr.   r-   dtypefloat16bfloat16)r/   hidden_statesvariances      r3   forwardzPix2StructLayerNorm.forwardB   s     !##EM2266q99>>r4>PP%Ht?T4T(U(UU ; ???),,T[->??M{]**r4   )r&   __name__
__module____qualname__r)   rC   __classcell__r2   s   @r3   r%   r%   9   sL        $ $ $ $ $ $+ + + + + + +r4   r%   )FusedRMSNormzWDiscovered apex.normalization.FusedRMSNorm - will use it instead of Pix2StructLayerNormzJDiscovered apex but it failed to load, falling back to Pix2StructLayerNormc                   P     e Zd ZdZdeddf fdZdej        dej        fdZ xZ	S )Pix2StructVisionEmbeddingsa-  
    Construct the embeddings from patch. In `Pix2Struct` the input is different from classic Vision-transformer models.
    Here the input is a sequence of `seq_len` flattened patches that also combines padding patches (tokens). Each patch
    is represented by a vector of `hidden_size` values.
    configreturnNc                 \   t                                                       t          j        |j        |j                  | _        t          j        |j        |j                  | _	        t          j        |j        |j                  | _
        t          j        |j                  | _        d S N)r(   r)   r   Linearpatch_embed_hidden_sizer0   patch_projection	Embeddingseq_lenrow_embeddercolumn_embedderDropoutdropout_ratedropoutr/   rM   r2   s     r3   r)   z#Pix2StructVisionEmbeddings.__init__i   s}     "	&*H&J\ ] ]L9KLL!|FNF<NOOz&"566r4   flattened_patchesc                 d   |d d d d df                                          }|d d d d df                                          }|d d d d dd f         }|                     |          }|                     |          }|                     |          }||z   |z   }|                     |          }|S )Nr   r    r6   )longrS   rV   rW   rZ   )r/   r\   row_indicescol_indices
embeddingsrow_embeddingscol_embeddingss          r3   rC   z"Pix2StructVisionEmbeddings.forwardr   s     (111a05577'111a05577-aaaABBh7**+<==
**;77--k::  .0>A
\\*--
r4   )
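
# Editor's note: the helper below is an illustrative sketch added during editing and is
# not part of the upstream transformers file. It shows the input layout that
# `Pix2StructVisionEmbeddings` expects: each flattened patch carries its 1-indexed
# (row, column) position in the first two channels and the raw pixel values in the rest.
# Names prefixed with `_demo_` are hypothetical.
def _demo_vision_embedding_shapes():
    config = Pix2StructVisionConfig()  # defaults: patch_embed_hidden_size=768, hidden_size=768
    embeddings = Pix2StructVisionEmbeddings(config)

    # (batch, seq_len, 2 + num_channels * patch_h * patch_w) = (1, 16, 2 + 3 * 16 * 16)
    flattened_patches = torch.zeros(1, 16, 2 + config.patch_embed_hidden_size)
    flattened_patches[..., 0] = 1.0  # row index of each patch
    flattened_patches[..., 1] = torch.arange(1, 17).float()  # column index of each patch

    out = embeddings(flattened_patches)
    assert out.shape == (1, 16, config.hidden_size)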


class Pix2StructVisionAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.key_value_proj_dim = config.d_kv
        self.n_heads = config.num_attention_heads
        self.dropout = config.attention_dropout
        self.inner_dim = self.n_heads * self.key_value_proj_dim

        # Mesh TensorFlow initialization to avoid scaling before softmax
        self.query = nn.Linear(self.hidden_size, self.inner_dim, bias=False)
        self.key = nn.Linear(self.hidden_size, self.inner_dim, bias=False)
        self.value = nn.Linear(self.hidden_size, self.inner_dim, bias=False)
        self.output = nn.Linear(self.inner_dim, self.hidden_size, bias=False)
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        output_attentions=False,
    ):
        """
        Self-attention block
        """
        # Input is (batch_size, seq_length, dim)
        # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length)
        batch_size, seq_length = hidden_states.shape[:2]

        def to_projection_shape(states):
            """projection"""
            return states.contiguous().view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)

        # get query states
        # (batch_size, n_heads, seq_length, dim_per_head)
        query_states = to_projection_shape(self.query(hidden_states))

        # get key/value states
        key_states = to_projection_shape(self.key(hidden_states))
        value_states = to_projection_shape(self.value(hidden_states))

        # compute scores
        # equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states)
        scores = torch.matmul(query_states, key_states.transpose(3, 2))

        if position_bias is None:
            position_bias = torch.zeros(
                (1, self.n_heads, seq_length, seq_length), device=scores.device, dtype=scores.dtype
            )
            if self.gradient_checkpointing and self.training:
                position_bias.requires_grad = True

            if attention_mask is not None and attention_mask.dim() == 2:
                position_bias = position_bias + attention_mask[:, None, None, :].to(position_bias.device)
            elif attention_mask is not None:
                # extended attention mask of shape (batch_size, n_heads, seq_length, key_length)
                position_bias = position_bias + attention_mask.to(position_bias.device)
            elif not is_torchdynamo_compiling():
                attention_mask = torch.ones(
                    (batch_size, seq_length), device=position_bias.device, dtype=position_bias.dtype
                )
                position_bias = position_bias + attention_mask.to(position_bias.device)

        # 1 means "attend", 0 means "do not attend"; flip into an additive mask of {0, -inf}
        position_bias = 1 - position_bias
        position_bias_masked = position_bias.masked_fill(position_bias == 1, torch.finfo(scores.dtype).min)
        scores += position_bias_masked
        scores = torch.max(scores, torch.tensor(torch.finfo(scores.dtype).min))

        # (batch_size, n_heads, seq_length, key_length)
        attn_weights = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).type_as(scores)

        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        # Mask heads if we want to
        if layer_head_mask is not None:
            attn_weights = attn_weights * layer_head_mask

        attn_output = torch.matmul(attn_weights, value_states)

        # (batch_size, seq_length, dim)
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim)

        attn_output = self.output(attn_output)

        outputs = (attn_output,) + (position_bias,)

        if output_attentions:
            outputs = outputs + (attn_weights,)
        return outputs


class Pix2StructVisionMlp(nn.Module):
    def __init__(self, config: Pix2StructVisionConfig):
        super().__init__()
        self.wi_0 = nn.Linear(config.hidden_size, config.d_ff, bias=False)
        self.wi_1 = nn.Linear(config.hidden_size, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.hidden_size, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)
        self.act = ACT2FN[config.dense_act_fn]

    def forward(self, hidden_states):
        hidden_gelu = self.act(self.wi_0(hidden_states))
        hidden_linear = self.wi_1(hidden_states)
        hidden_states = hidden_gelu * hidden_linear
        hidden_states = self.dropout(hidden_states)

        # To keep 8bit quantization working, `self.wo` may be kept in float32; make sure
        # the input matches its dtype unless the weights were forced to `int8`.
        if (
            isinstance(self.wo.weight, torch.Tensor)
            and hidden_states.dtype != self.wo.weight.dtype
            and self.wo.weight.dtype != torch.int8
        ):
            hidden_states = hidden_states.to(self.wo.weight.dtype)

        hidden_states = self.wo(hidden_states)
        return hidden_states
ej        ej        f         e
ej                 f         f
d
Z xZS )Pix2StructVisionLayerrM   rN   Nc                 >   t                                                       |j        | _        d| _        t	          |          | _        t          |          | _        t          |j	        |j
                  | _        t          |j	        |j
                  | _        d S )Nr    r1   )r(   r)   chunk_size_feed_forwardseq_len_dimrg   	attentionr   mlpr%   r0   layer_norm_epspre_mlp_layer_normpre_attention_layer_normr[   s     r3   r)   zPix2StructVisionLayer.__init__  s    '-'E$26::&v.."5f6HfNc"d"d"d(;F<NTZTi(j(j(j%%%r4   FrA   r   	head_maskr   c                     |}|                      |          }|                     ||||          }|d         }|dd          }||z   }|                     |          }	|                     |	          |z   }	|	f|z   }|S )N)r   r   r   r   r    )r   r   r   r   )
r/   rA   r   r   r   residualself_attention_outputsattention_outputr   layer_outputs
             r3   rC   zPix2StructVisionLayer.forward  s     ! 55mDD!%)%/	 "0 "
 "
 2!4(, )83 ..}==xx--=/G+r4   )NNF)rE   rF   rG   r!   r)   r+   re   r   boolr   r   rC   rH   rI   s   @r3   r   r     s        k/ kD k k k k k k 26,0"' | !. EL)	
   
uU\5</0%2EE	F       r4   r   c                        e Zd Zdeddf fdZ	 	 	 	 	 ddej        deej                 d	eej                 d
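
# Editor's note: an illustrative sketch (not upstream code) of the pre-norm residual
# ordering implemented by `Pix2StructVisionLayer`: each sublayer normalizes its input
# first and adds its output back onto the unnormalized residual stream.
def _demo_pre_norm_block(layer: Pix2StructVisionLayer, hidden_states: torch.Tensor) -> torch.Tensor:
    attn_out = layer.attention(layer.pre_attention_layer_norm(hidden_states))[0]
    hidden_states = attn_out + hidden_states  # first residual connection
    return layer.mlp(layer.pre_mlp_layer_norm(hidden_states)) + hidden_states  # second residual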
ededede	e
ef         fdZ xZS )Pix2StructVisionEncoderrM   rN   Nc                     t                                                       | _        t          j        fdt          j                  D                       | _        d| _        d S )Nc                 .    g | ]}t                    S  )r   ).0_rM   s     r3   
<listcomp>z4Pix2StructVisionEncoder.__init__.<locals>.<listcomp>4  s"    #k#k#ka$9&$A$A#k#k#kr4   F)	r(   r)   rM   r   
ModuleListrangenum_hidden_layerslayerrv   r[   s    `r3   r)   z Pix2StructVisionEncoder.__init__1  sa    ]#k#k#k#k5QWQiKjKj#k#k#kll
&+###r4   FTrA   r   r   r   output_hidden_statesreturn_dictc                    |rdnd }|rdnd }t          | j                  D ]j\  }	}
|r||fz   }|||	         nd }| j        r&| j        r|                     |
j        ||||          }n |
||||          }|d         }|r||d         fz   }k|r||fz   }|st          d |||fD                       S t          |||          S )Nr   r   r    c              3      K   | ]}||V  	d S rP   r   r   vs     r3   	<genexpr>z2Pix2StructVisionEncoder.forward.<locals>.<genexpr>]  s(      mmq_`_l_l_l_l_lmmr4   last_hidden_staterA   
attentions)	enumerater   rv   r   _gradient_checkpointing_func__call__tupler   )r/   rA   r   r   r   r   r   all_hidden_statesall_self_attentionsilayer_moduler   layer_outputss                r3   rC   zPix2StructVisionEncoder.forward7  sN    #7@BBD$5?bb4(44 	P 	POA|# I$58H$H!.7.CillO* 	pt} 	p $ A A )!"#%! ! !-]NO]n o o)!,M  P&9]1=M<O&O# 	E 1]4D D 	nmm]4EGZ$[mmmmmm++*
 
 
 	
r4   )NNFFT)rE   rF   rG   r!   r)   r+   re   r   r   r   r   r   rC   rH   rI   s   @r3   r   r   0  s        ,/ ,D , , , , , , 26,0"'%* +
 +
|+
 !.+
 EL)	+

  +
 #+
 +
 
uo%	&+
 +
 +
 +
 +
 +
 +
 +


class Pix2StructPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = Pix2StructConfig

    _supports_cache_class = True
    _supports_static_cache = False

    @property
    def dummy_inputs(self):
        input_ids = torch.tensor(DUMMY_INPUTS)
        input_mask = torch.tensor(DUMMY_MASK)
        dummy_inputs = {
            "decoder_input_ids": input_ids,
            "input_ids": input_ids,
            "decoder_attention_mask": input_mask,
        }
        return dummy_inputs

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor  # Used for testing weights initialization
        if isinstance(module, Pix2StructLayerNorm):
            module.weight.data.fill_(factor * 1.0)
        elif isinstance(module, Pix2StructTextDenseGatedActDense):
            hidden_size = (
                self.config.text_config.hidden_size
                if isinstance(self.config, Pix2StructConfig)
                else self.config.hidden_size
            )
            d_ff = self.config.text_config.d_ff if isinstance(self.config, Pix2StructConfig) else self.config.d_ff

            module.wi_0.weight.data.normal_(mean=0.0, std=factor * ((hidden_size) ** -0.5))
            if hasattr(module.wi_0, "bias") and module.wi_0.bias is not None:
                module.wi_0.bias.data.zero_()
            module.wi_1.weight.data.normal_(mean=0.0, std=factor * ((hidden_size) ** -0.5))
            if hasattr(module.wi_1, "bias") and module.wi_1.bias is not None:
                module.wi_1.bias.data.zero_()
            module.wo.weight.data.normal_(mean=0.0, std=factor * ((d_ff) ** -0.5))
            if hasattr(module.wo, "bias") and module.wo.bias is not None:
                module.wo.bias.data.zero_()
        elif isinstance(module, Pix2StructTextAttention):
            # Mesh TensorFlow attention initialization to avoid scaling before softmax
            hidden_size = (
                self.config.text_config.hidden_size
                if isinstance(self.config, Pix2StructConfig)
                else self.config.hidden_size
            )
            key_value_proj_dim = (
                self.config.text_config.d_kv if isinstance(self.config, Pix2StructConfig) else self.config.hidden_size
            )
            n_heads = (
                self.config.text_config.num_heads
                if isinstance(self.config, Pix2StructConfig)
                else self.config.num_heads
            )

            module.query.weight.data.normal_(mean=0.0, std=factor * ((hidden_size * key_value_proj_dim) ** -0.5))
            module.key.weight.data.normal_(mean=0.0, std=factor * (hidden_size**-0.5))
            module.value.weight.data.normal_(mean=0.0, std=factor * (hidden_size**-0.5))
            module.output.weight.data.normal_(mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5))
            if module.has_relative_attention_bias:
                module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((hidden_size) ** -0.5))
        elif isinstance(module, nn.Embedding):
            hidden_size = (
                self.config.text_config.hidden_size
                if isinstance(self.config, Pix2StructConfig)
                else self.config.hidden_size
            )
            module.weight.data.normal_(mean=0.0, std=factor * hidden_size**-0.5)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, Pix2StructTextModel):
            hidden_size = (
                self.config.text_config.hidden_size
                if isinstance(self.config, Pix2StructConfig)
                else self.config.hidden_size
            )
            module.lm_head.weight.data.normal_(mean=0.0, std=factor * hidden_size**-0.5)
        elif isinstance(module, (nn.Linear, nn.Conv2d)):
            # Upcast to fp32 and cast back, since `trunc_normal_` is not implemented for half precision on CPU
            module.weight.data = nn.init.trunc_normal_(
                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
            ).to(module.weight.dtype)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, Pix2StructLayerNorm):
            if module.weight is not None:
                module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()

    # Copied from transformers.models.t5.modeling_t5.T5PreTrainedModel._shift_right with T5->Pix2Struct
    def _shift_right(self, input_ids):
        decoder_start_token_id = self.config.decoder_start_token_id
        pad_token_id = self.config.pad_token_id

        if decoder_start_token_id is None:
            raise ValueError(
                "self.model.config.decoder_start_token_id has to be defined. In Pix2Struct it is usually set to the "
                "pad_token_id. See Pix2Struct docs for more information."
            )

        # shift inputs to the right
        if is_torch_fx_proxy(input_ids):
            # Item assignment is not supported natively for proxies.
            shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id)
            shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1)
        else:
            shifted_input_ids = input_ids.new_zeros(input_ids.shape)
            shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
            shifted_input_ids[..., 0] = decoder_start_token_id

        if pad_token_id is None:
            raise ValueError("self.model.config.pad_token_id has to be defined.")
        # replace possible -100 values in labels by `pad_token_id`
        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

        return shifted_input_ids
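
# Editor's note: an illustrative sketch (not upstream code) of `_shift_right`, which
# builds `decoder_input_ids` from labels by prepending `decoder_start_token_id` (for
# Pix2Struct checkpoints this equals `pad_token_id`) and dropping the last label; any
# remaining -100 ignore-index entries are replaced by `pad_token_id`.
def _demo_shift_right(model: Pix2StructPreTrainedModel) -> torch.Tensor:
    labels = torch.tensor([[5, 6, 7]])
    # with decoder_start_token_id == 0 this returns [[0, 5, 6]]
    return model._shift_right(labels)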

PIX2STRUCT_VISION_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`Pix2StructConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

PIX2STRUCT_VISION_INPUTS_DOCSTRING = r"""
    Args:
        flattened_patches (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_channels x patch_height x patch_width)`):
            Flattened and padded pixel values. These values can be obtained using [`AutoImageProcessor`]. See
            [`Pix2StructVisionImageProcessor.__call__`] for details. Check the [original
            paper](https://arxiv.org/abs/2210.03347) (figure 5) for more details.

        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare Pix2StructVision Model transformer outputting raw hidden-states without any specific head on top.",
    PIX2STRUCT_VISION_START_DOCSTRING,
)
class Pix2StructVisionModel(Pix2StructPreTrainedModel):
    config_class = Pix2StructVisionConfig
    main_input_name = "flattened_patches"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Pix2StructVisionLayer"]

    def __init__(self, config: Pix2StructConfig):
        super().__init__(config)
        self.config = config

        self.embeddings = Pix2StructVisionEmbeddings(config)
        self.encoder = Pix2StructVisionEncoder(config)

        self.layernorm = Pix2StructLayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_projection

    def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(PIX2STRUCT_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        flattened_patches: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        Example:

        ```python
        >>> import requests
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, Pix2StructVisionModel

        >>> image_processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
        >>> model = Pix2StructVisionModel.from_pretrained("google/pix2struct-textcaps-base")

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 2048, 768]
        ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if flattened_patches is None:
            raise ValueError("You have to specify flattened_patches")

        if attention_mask is None:
            # check where `flattened_patches` is not 0
            attention_mask = (flattened_patches.sum(dim=-1) != 0).float()

        # Prepare head mask if needed: 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(flattened_patches)

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)

        if not return_dict:
            head_outputs = (sequence_output,)
            return head_outputs + encoder_outputs[1:]

        return BaseModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


# Copied from transformers.models.t5.modeling_t5.T5DenseGatedActDense with T5->Pix2StructText,d_model->hidden_size
class Pix2StructTextDenseGatedActDense(nn.Module):
    def __init__(self, config: Pix2StructTextConfig):
        super().__init__()
        self.wi_0 = nn.Linear(config.hidden_size, config.d_ff, bias=False)
        self.wi_1 = nn.Linear(config.hidden_size, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.hidden_size, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)
        self.act = ACT2FN[config.dense_act_fn]

    def forward(self, hidden_states):
        hidden_gelu = self.act(self.wi_0(hidden_states))
        hidden_linear = self.wi_1(hidden_states)
        hidden_states = hidden_gelu * hidden_linear
        hidden_states = self.dropout(hidden_states)

        # To keep 8bit quantization working, `self.wo` may be kept in float32; make sure
        # the input matches its dtype unless the weights were forced to `int8`.
        if (
            isinstance(self.wo.weight, torch.Tensor)
            and hidden_states.dtype != self.wo.weight.dtype
            and self.wo.weight.dtype != torch.int8
        ):
            hidden_states = hidden_states.to(self.wo.weight.dtype)

        hidden_states = self.wo(hidden_states)
        return hidden_states


class Pix2StructTextLayerFF(nn.Module):
    def __init__(self, config: Pix2StructTextConfig):
        super().__init__()
        self.DenseReluDense = Pix2StructTextDenseGatedActDense(config)

        self.layer_norm = Pix2StructLayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    # Copied from transformers.models.t5.modeling_t5.T5LayerFF.forward
    def forward(self, hidden_states):
        forwarded_states = self.layer_norm(hidden_states)
        forwarded_states = self.DenseReluDense(forwarded_states)
        hidden_states = hidden_states + self.dropout(forwarded_states)
        return hidden_states


class Pix2StructTextAttention(nn.Module):
    def __init__(
        self, config: Pix2StructTextConfig, has_relative_attention_bias=False, layer_idx: Optional[int] = None
    ):
        super().__init__()
        self.has_relative_attention_bias = has_relative_attention_bias
        self.relative_attention_num_buckets = config.relative_attention_num_buckets
        self.relative_attention_max_distance = config.relative_attention_max_distance
        self.hidden_size = config.hidden_size
        self.key_value_proj_dim = config.d_kv
        self.n_heads = config.num_heads
        self.dropout = config.dropout_rate
        self.inner_dim = self.n_heads * self.key_value_proj_dim
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended "
                "and will lead to errors during the forward call, if caching is used. Please make sure to provide a "
                "`layer_idx` when creating this class."
            )

        # Mesh TensorFlow initialization to avoid scaling before softmax
        self.query = nn.Linear(self.hidden_size, self.inner_dim, bias=False)
        self.key = nn.Linear(self.hidden_size, self.inner_dim, bias=False)
        self.value = nn.Linear(self.hidden_size, self.inner_dim, bias=False)
        self.output = nn.Linear(self.inner_dim, self.hidden_size, bias=False)

        if self.has_relative_attention_bias:
            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
        self.pruned_heads = set()
        self.gradient_checkpointing = False

    @staticmethod
    # Copied from transformers.models.t5.modeling_t5.T5Attention._relative_position_bucket
    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
        """
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        """
        relative_buckets = 0
        if bidirectional:
            num_buckets //= 2
            relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
            relative_position = torch.abs(relative_position)
        else:
            relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
        # now relative_position is in the range [0, inf)

        # half of the buckets are for exact increments in positions
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact

        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
        relative_position_if_large = max_exact + (
            torch.log(relative_position.float() / max_exact)
            / math.log(max_distance / max_exact)
            * (num_buckets - max_exact)
        ).to(torch.long)
        relative_position_if_large = torch.min(
            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
        )

        relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
        return relative_buckets

    def compute_bias(self, query_length, key_length, device=None):
        """Compute binned relative position bias"""
        if device is None:
            device = self.relative_attention_bias.weight.device
        context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
        memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
        relative_position = memory_position - context_position  # shape (query_length, key_length)
        relative_position_bucket = self._relative_position_bucket(
            relative_position,  # shape (query_length, key_length)
            bidirectional=False,
            num_buckets=self.relative_attention_num_buckets,
            max_distance=self.relative_attention_max_distance,
        )
        values = self.relative_attention_bias(relative_position_bucket)  # shape (query_length, key_length, num_heads)
        values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, query_length, key_length)
        return values
    def forward(
        self,
        hidden_states,
        mask=None,
        key_value_states=None,
        position_bias=None,
        past_key_value=None,
        layer_head_mask=None,
        query_length=None,
        use_cache=False,
        output_attentions=False,
        cache_position=None,
    ):
        """
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        """
        # Input is (batch_size, seq_length, dim)
        # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length)
        batch_size, seq_length = hidden_states.shape[:2]

        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
        is_cross_attention = key_value_states is not None

        query_states = self.query(hidden_states).contiguous()
        query_states = query_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)

        if past_key_value is not None:
            is_updated = past_key_value.is_updated.get(self.layer_idx)
            if is_cross_attention:
                # after the first generated id, we can subsequently re-use all key/value_states from cache
                curr_past_key_value = past_key_value.cross_attention_cache
            else:
                curr_past_key_value = past_key_value.self_attention_cache

        current_states = key_value_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_value is not None and is_updated:
            # reuse k, v, cross_attentions
            key_states = curr_past_key_value.key_cache[self.layer_idx]
            value_states = curr_past_key_value.value_cache[self.layer_idx]
        else:
            key_states = self.key(current_states).contiguous()
            value_states = self.value(current_states).contiguous()
            key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
            value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)

            if past_key_value is not None:
                # save all key/value_states to cache to be re-used for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_states, value_states = curr_past_key_value.update(
                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
                )
                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
                if is_cross_attention:
                    past_key_value.is_updated[self.layer_idx] = True

        # compute scores, equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states)
        scores = torch.matmul(query_states, key_states.transpose(3, 2))

        if position_bias is None:
            key_length = key_states.shape[-2]
            # cache position is 0-indexed so we add 1 to get the real length of queries (aka with past)
            real_seq_length = query_length if query_length is not None else cache_position[-1] + 1
            if not self.has_relative_attention_bias:
                position_bias = torch.zeros(
                    (1, self.n_heads, seq_length, key_length), device=scores.device, dtype=scores.dtype
                )
                if self.gradient_checkpointing and self.training:
                    position_bias.requires_grad = True
            else:
                position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device)
                position_bias = position_bias[:, :, -seq_length:, :]

            if mask is not None:
                causal_mask = mask[:, :, :, : key_states.shape[-2]]
                position_bias = position_bias + causal_mask

        if self.pruned_heads:
            mask = torch.ones(position_bias.shape[1])
            mask[list(self.pruned_heads)] = 0
            position_bias_masked = position_bias[:, mask.bool()]
        else:
            position_bias_masked = position_bias

        scores += position_bias_masked

        # (batch_size, n_heads, seq_length, key_length)
        attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)

        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        # Mask heads if we want to
        if layer_head_mask is not None:
            attn_weights = attn_weights * layer_head_mask

        attn_output = torch.matmul(attn_weights, value_states)

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(batch_size, -1, self.inner_dim)
        attn_output = self.output(attn_output)

        outputs = (attn_output, past_key_value, position_bias)

        if output_attentions:
            outputs = outputs + (attn_weights,)
        return outputs


class Pix2StructTextLayerSelfAttention(nn.Module):
    def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None):
        super().__init__()
        self.attention = Pix2StructTextAttention(
            config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx
        )
        self.layer_norm = Pix2StructLayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
        cache_position=None,
    ):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.attention(
            normed_hidden_states,
            mask=attention_mask,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        hidden_states = hidden_states + self.dropout(attention_output[0])
        outputs = (hidden_states,) + attention_output[1:]  # add attentions if we output them
        return outputs


class Pix2StructTextLayerCrossAttention(nn.Module):
    def __init__(self, config, layer_idx: Optional[int] = None):
        super().__init__()
        self.attention = Pix2StructTextAttention(config, has_relative_attention_bias=False, layer_idx=layer_idx)
        self.layer_norm = Pix2StructLayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        key_value_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        query_length=None,
        output_attentions=False,
        cache_position=None,
    ):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.attention(
            normed_hidden_states,
            mask=attention_mask,
            key_value_states=key_value_states,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            query_length=query_length,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        layer_output = hidden_states + self.dropout(attention_output[0])
        outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
        return outputs


class Pix2StructTextBlock(nn.Module):
    def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None):
        super().__init__()

        self.self_attention = Pix2StructTextLayerSelfAttention(
            config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx
        )

        self.encoder_decoder_attention = Pix2StructTextLayerCrossAttention(config, layer_idx=layer_idx)

        self.mlp = Pix2StructTextLayerFF(config)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        encoder_decoder_position_bias=None,
        layer_head_mask=None,
        cross_attn_layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
        return_dict=True,
        cache_position=None,
    ):
        self_attention_outputs = self.self_attention(
            hidden_states,
            attention_mask=attention_mask,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        hidden_states, past_key_value = self_attention_outputs[:2]
        attention_outputs = self_attention_outputs[2:]  # Keep self-attention outputs and relative position weights

        # clamp inf values to enable fp16 training
        if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        do_cross_attention = encoder_hidden_states is not None
        if do_cross_attention:
            cross_attention_outputs = self.encoder_decoder_attention(
                hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                position_bias=encoder_decoder_position_bias,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=past_key_value,
                query_length=cache_position[-1] + 1,
                use_cache=use_cache,
                output_attentions=output_attentions,
            )
            hidden_states, past_key_value = cross_attention_outputs[:2]

            # clamp inf values to enable fp16 training
            if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

            # Keep cross-attention outputs and relative position weights
            attention_outputs = attention_outputs + cross_attention_outputs[2:]

        # Apply Feed Forward layer
        hidden_states = self.mlp(hidden_states)

        # clamp inf values to enable fp16 training
        if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        outputs = (hidden_states,)

        if use_cache:
            outputs = outputs + (past_key_value,) + attention_outputs
        else:
            outputs = outputs + attention_outputs

        return outputs


PIX2STRUCT_START_DOCSTRING = r"""
    The Pix2Struct model was proposed in [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language
    Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu,
    Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova. It's an encoder decoder
    transformer pre-trained in a image-to-text setting.

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config (Union[`Pix2StructConfig`, `Pix2StructTextConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

PIX2STRUCT_TEXT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Pix2StructText is a model with relative position
            embeddings so you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [Pix2StructText
            Training](./t5#training).
        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Pix2StructText uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [Pix2StructText
            Training](./t5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the encoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
                `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
            Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*, `optional`: *attentions*)
            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of hidden states at
            the output of the last layer of the encoder. Used in the cross-attention of the decoder.
        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention layers. Can be used to speed up decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
            representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be
            input (see `past_key_values`). This is useful if you want more control over how to convert
            `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.

            If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value
            of `inputs_embeds`.

        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
            Indices depicting the position of the input sequence tokens in the sequence. It is used to update the
            cache in the correct position and to infer the complete sequence length.
"""

PIX2STRUCT_INPUTS_DOCSTRING = r"""
    Args:
        flattened_patches (`torch.FloatTensor` of shape `(batch_size, seq_length, hidden_size)`):
            Flattened pixel patches. the `hidden_size` is obtained by the following formula: `hidden_size` =
            `num_channels` * `patch_size` * `patch_size`

            The process of flattening the pixel patches is done by `Pix2StructProcessor`.

        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Pix2StructText uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [Pix2StructText
            Training](./t5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the encoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
                `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
            Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*, `optional`: *attentions*)
            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of hidden states at
            the output of the last layer of the encoder. Used in the cross-attention of the decoder.
        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention layers. Can be used to speed up decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
            representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be
            input (see `past_key_values`). This is useful if you want more control over how to convert
            `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.

            If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value
            of `inputs_embeds`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss for the decoder.

        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The standalone text decoder of Pix2Struct",
    PIX2STRUCT_START_DOCSTRING,
)
class Pix2StructTextModel(Pix2StructPreTrainedModel):
    config_class = Pix2StructTextConfig
    _no_split_modules = ["Pix2StructTextBlock"]
    _tied_weights_keys = ["lm_head.weight"]
    supports_gradient_checkpointing = True

    def __init__(self, config):
        super().__init__(config)
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        self.layer = nn.ModuleList(
            [Pix2StructTextBlock(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)]
        )
        self.final_layer_norm = Pix2StructLayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()
        self.gradient_checkpointing = False

    def _reorder_cache(self, past_key_values, beam_idx):
        # If the decoder past is not included in the output, speedy decoding is disabled
        # and there is nothing to reorder.
        if past_key_values is None:
            logger.warning("You might want to consider setting `use_cache=True` to speed up decoding")
            return past_key_values

        reordered_decoder_past = ()
        for layer_past_states in past_key_values:
            # Select the correct batch indices for each of the cached key / value states.
            reordered_layer_past_states = ()
            for layer_past_state in layer_past_states:
                reordered_layer_past_states = reordered_layer_past_states + (
                    layer_past_state.index_select(0, beam_idx.to(layer_past_state.device)),
                )

            if reordered_layer_past_states[0].shape != layer_past_states[0].shape:
                raise ValueError(
                    f"reordered_layer_past_states[0] shape {reordered_layer_past_states[0].shape} "
                    f"and layer_past_states[0] shape {layer_past_states[0].shape} mismatched"
                )
            if len(reordered_layer_past_states) != len(layer_past_states):
                raise ValueError(
                    f"length of reordered_layer_past_states {len(reordered_layer_past_states)} "
                    f"and length of layer_past_states {len(layer_past_states)} mismatched"
                )

            reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,)
        return reordered_decoder_past

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, new_embeddings):
        self.embed_tokens = new_embeddings

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    @add_start_docstrings_to_model_forward(PIX2STRUCT_TEXT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple[torch.FloatTensor, ...], CausalLMOutputWithCrossAttentions]:
        r"""
        Returns:

        Example:

        ```python
        >>> from transformers import AutoProcessor, Pix2StructTextModel

        >>> processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
        >>> model = Pix2StructTextModel.from_pretrained("google/pix2struct-textcaps-base")

        >>> inputs = processor(text="Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> loss = outputs.loss
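
        >>> # Added sketch (not part of the original example): reuse the returned cache for one
        >>> # further decoding step, assuming the same processor/model objects as above.
        >>> first_pass = model(**inputs, use_cache=True)
        >>> next_token = first_pass.logits[:, -1:].argmax(dim=-1)
        >>> second_pass = model(input_ids=next_token, past_key_values=first_pass.past_key_values, use_cache=True)
        >>> list(second_pass.logits.shape[:2])  # only the one new position is scored
        [1, 1]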
        ```
        """
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

        if inputs_embeds is None:
            assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings"
            inputs_embeds = self.embed_tokens(input_ids)

        batch_size, seq_length = input_shape

        # Initialize the cache, converting legacy tuple caches so the same format can be handed back to the caller.
        return_legacy_cache = False
        return_self_attention_cache = False
        if use_cache or past_key_values is not None:
            if isinstance(past_key_values, Cache) and not isinstance(past_key_values, EncoderDecoderCache):
                return_self_attention_cache = True
                past_key_values = EncoderDecoderCache(past_key_values, DynamicCache())
            elif not isinstance(past_key_values, Cache):
                return_legacy_cache = True
                logger.warning_once(
                    "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers "
                    "v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. "
                    "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
                )
                past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
            elif past_key_values is None:
                past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache())

        past_key_values_length = 0
        if cache_position is not None:
            past_key_values_length = cache_position[0]
        elif past_key_values is not None:
            past_key_values_length = past_key_values.get_seq_length()

        if cache_position is None:
            cache_position = torch.arange(
                past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
            )

        if attention_mask is None:
            # The required mask length can be calculated from the length of the past cache.
            mask_seq_length = (
                past_key_values.get_seq_length() + seq_length if past_key_values is not None else seq_length
            )
            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)

        if self.config.is_decoder:
            causal_mask = self._update_causal_mask(
                attention_mask,
                inputs_embeds,
                cache_position,
                past_key_values.self_attention_cache if past_key_values is not None else None,
                output_attentions,
            )
        else:
            causal_mask = attention_mask[:, None, None, :]
            causal_mask = causal_mask.to(dtype=inputs_embeds.dtype)
            causal_mask = (1.0 - causal_mask) * torch.finfo(inputs_embeds.dtype).min

        # If attending to encoder states, invert the encoder padding mask so that masked positions
        # receive a large negative additive bias.
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=inputs_embeds.device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_layers)
        cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers)
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions else None
        position_bias = None
        encoder_decoder_position_bias = None

        hidden_states = self.dropout(inputs_embeds)

        for i, layer_module in enumerate(self.layer):
            layer_head_mask = head_mask[i]
            cross_attn_layer_head_mask = cross_attn_head_mask[i]
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                if use_cache:
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                    )
                    use_cache = False
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.forward,
                    hidden_states,
                    causal_mask,
                    position_bias,
                    encoder_hidden_states,
                    encoder_extended_attention_mask,
                    encoder_decoder_position_bias,
                    layer_head_mask,
                    cross_attn_layer_head_mask,
                    None,  # past_key_value is always None with gradient checkpointing
                    use_cache,
                    output_attentions,
                    cache_position,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_bias=position_bias,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_extended_attention_mask,
                    encoder_decoder_position_bias=encoder_decoder_position_bias,
                    layer_head_mask=layer_head_mask,
                    cross_attn_layer_head_mask=cross_attn_layer_head_mask,
                    past_key_value=past_key_values,
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                    cache_position=cache_position,
                )

            if use_cache is False:
                layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:]

            hidden_states, next_decoder_cache = layer_outputs[:2]

            # The relative position biases are computed by the first layer and then shared with the
            # following layers, which avoids recomputing them at every depth.
            position_bias = layer_outputs[2]
            if encoder_hidden_states is not None:
                encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[3],)
                if encoder_hidden_states is not None:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[5],)

        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states)

        logits = self.lm_head(hidden_states)

        # Add the hidden states of the last layer
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        loss = None
        if labels is not None:
            # move labels to the logits device to enable model parallelism
            labels = labels.to(logits.device)
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100, reduction="mean")
            loss = loss_fct(logits.contiguous().view(-1, logits.size(-1)), labels.contiguous().view(-1))

        next_cache = next_decoder_cache if use_cache else None
        if return_self_attention_cache:
            next_cache = past_key_values.self_attention_cache
        if return_legacy_cache:
            next_cache = next_cache.to_legacy_cache()

        if not return_dict:
            return tuple(
                v
                for v in [
                    loss,
                    logits,
                    next_cache,
                    all_hidden_states,
                    all_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=logits,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
            cross_attentions=all_cross_attentions,
        )

    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        # For SDPA, when possible, we rely on its `is_causal` argument instead of its `attn_mask`
        # argument, in order to dispatch to Flash Attention 2. This is not compatible with static
        # cache, as SDPA will then fail to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_static_cache = isinstance(past_key_values, StaticCache)

        # When output_attentions is True, the sdpa implementation falls back to the eager one anyway.
        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype, device = input_tensor.dtype, input_tensor.device
        sequence_length = input_tensor.shape[1]
        if using_static_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided attention mask is 2D, build the full 4D causal mask here.
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            device=device,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type == "cuda"
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows, e.g. the relevant first rows when using
            # left padding. This is required by F.scaled_dot_product_attention's memory-efficient
            # path. Details: https://github.com/pytorch/pytorch/issues/110213
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        device: torch.device,
        cache_position: torch.Tensor,
        batch_size: int,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, does nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            device (`torch.device`):
                The device to place the 4D attention mask on.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`int`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # The mask already comes in an inverted 4D form and needs no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for the in-place edit below
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask


@add_start_docstrings(
    "A conditional generation model with a language modeling head. Can be used for sequence generation tasks.",
    PIX2STRUCT_START_DOCSTRING,
)
class Pix2StructForConditionalGeneration(Pix2StructPreTrainedModel, GenerationMixin):
    config_class = Pix2StructConfig
    main_input_name = "flattened_patches"
    _tied_weights_keys = ["decoder.lm_head.weight"]

    def __init__(self, config: Pix2StructConfig):
        super().__init__(config)

        self.encoder = Pix2StructVisionModel(config.vision_config)
        self.decoder = Pix2StructTextModel(config.text_config)

        self.is_vqa = config.is_vqa

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.decoder.get_input_embeddings()

    def set_input_embeddings(self, new_embeddings):
        self.decoder.set_input_embeddings(new_embeddings)

    def get_output_embeddings(self) -> nn.Module:
        return self.decoder.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.decoder.set_output_embeddings(new_embeddings)

    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Embedding:
        model_embeds = self.decoder.resize_token_embeddings(new_num_tokens)

        # update the vocab size kept on the composite config
        self.config.text_config.vocab_size = new_num_tokens

        return model_embeds

    def get_decoder(self):
        return self.decoder

    def get_encoder(self):
        return self.encoder

    @add_start_docstrings_to_model_forward(PIX2STRUCT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        flattened_patches: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        labels: Optional[torch.LongTensor] = None,
        decoder_inputs_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
        r"""
        Returns:

        Example:

        Inference:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Pix2StructForConditionalGeneration

        >>> processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
        >>> model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base")

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> # autoregressive generation
        >>> generated_ids = model.generate(**inputs, max_new_tokens=50)
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> print(generated_text)
        A stop sign is on a street corner.

        >>> # conditional generation
        >>> text = "A picture of"
        >>> inputs = processor(text=text, images=image, return_tensors="pt", add_special_tokens=False)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=50)
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> print(generated_text)
        A picture of a stop sign with a red stop sign
        ```
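
        The encoder and decoder can also be driven by hand, which makes the two-stage structure visible. The
        following is an added illustrative sketch (not part of the original examples): it reuses `inputs` from
        above and assumes the released checkpoints' convention that the pad token (id 0) doubles as the decoder
        start token.

        ```python
        >>> import torch

        >>> encoder_outputs = model.get_encoder()(
        ...     flattened_patches=inputs.flattened_patches, attention_mask=inputs.attention_mask
        ... )
        >>> decoder_input_ids = torch.zeros((1, 1), dtype=torch.long)  # assumed decoder start token (pad, id 0)
        >>> for _ in range(5):  # greedy decoding, one token at a time
        ...     logits = model(
        ...         encoder_outputs=encoder_outputs,
        ...         attention_mask=inputs.attention_mask,
        ...         decoder_input_ids=decoder_input_ids,
        ...     ).logits
        ...     next_id = logits[:, -1:].argmax(dim=-1)
        ...     decoder_input_ids = torch.cat([decoder_input_ids, next_id], dim=-1)
        ```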

        Training:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Pix2StructForConditionalGeneration

        >>> processor = AutoProcessor.from_pretrained("google/pix2struct-base")
        >>> model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-base")

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "A stop sign is on the street corner."

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> labels = processor(text=text, return_tensors="pt").input_ids

        >>> # forward pass
        >>> outputs = model(**inputs, labels=labels)
        >>> loss = outputs.loss
        >>> print(f"{loss.item():.5f}")
        5.94282
        ```
        """
        use_cache = use_cache if use_cache is not None else self.config.text_config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                flattened_patches=flattened_patches,
                attention_mask=attention_mask,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
            # get decoder inputs by shifting the lm labels to the right
            decoder_input_ids = self._shift_right(labels)
            decoder_attention_mask = (
                decoder_attention_mask
                if decoder_attention_mask is not None
                else decoder_input_ids.ne(self.config.pad_token_id).float()
            )
            # Always attend to the first token
            decoder_attention_mask[:, 0] = 1

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            labels=labels,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return Seq2SeqLMOutput(
            loss=decoder_outputs.loss,
            logits=decoder_outputs.logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )