
"""PyTorch UDOP model."""

import collections
import logging
import math
import random
from abc import ABC, abstractmethod
from copy import deepcopy
from dataclasses import dataclass
from typing import Any, Dict, Optional, Sequence, Tuple, Union

import torch
from torch import Tensor, nn
from torch.nn import CrossEntropyLoss

from transformers import UdopConfig
from transformers.modeling_outputs import Seq2SeqLMOutput, Seq2SeqModelOutput

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache, StaticCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_torchdynamo_compiling,
    replace_return_docstrings,
)


logger = logging.getLogger(__name__)

_CONFIG_FOR_DOC = "UdopConfig"


UDOP_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Args:
        config ([`UdopConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

UDOP_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UDOP is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left. Indices can be obtained using
            [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for detail.
            [What are input IDs?](../glossary#input-ids)

        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)

        bbox (`torch.LongTensor` of shape `({0}, 4)`, *optional*):
            Bounding boxes of each input sequence tokens. Selected in the range `[0,
            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
            y1) represents the position of the lower right corner.

            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
            token. See `pixel_values` for `patch_sequence_length`.

        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Batch of document images. Each image is divided into patches of shape `(num_channels, config.patch_size,
            config.patch_size)` and the total number of patches (=`patch_sequence_length`) equals to `((height /
            config.patch_size) * (width / config.patch_size))`.

        visual_bbox (`torch.LongTensor` of shape `(batch_size, patch_sequence_length, 4)`, *optional*):
            Bounding boxes of each patch in the image. If not provided, bounding boxes are created in the model.

        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary. Indices can be obtained using
            [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
            [What are decoder input IDs?](../glossary#decoder-input-ids) T5 uses the `pad_token_id` as the starting
            token for `decoder_input_ids` generation. If `past_key_values` is used, optionally only the last
            `decoder_input_ids` have to be input (see `past_key_values`). To know more on how to prepare
            `decoder_input_ids` for pretraining take a look at [T5 Training](./t5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the encoder. Mask values selected in `[0,
            1]`:
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
                `[0, 1]`:
                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.
        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
            Tuple consists of (`last_hidden_state`, `optional`: *hidden_states*, `optional`: *attentions*)
            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of hidden states at
            the output of the last layer of the encoder. Used in the cross-attention of the decoder.
        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
            representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be
            input (see `past_key_values`). This is useful if you want more control over how to convert
            `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. If
            `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value of
            `inputs_embeds`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
            Indices depicting the position of the input sequence tokens in the sequence. It is used to update the
            cache in the correct position and to infer the complete sequence length.
"""

UDOP_ENCODER_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            To know more on how to prepare `input_ids` for pretraining take a look a [T5 Training](./t5#training).
        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

        bbox (`torch.LongTensor` of shape `({0}, 4)`, *optional*):
            Bounding boxes of each input sequence tokens. Selected in the range `[0,
            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
            y1) represents the position of the lower right corner.

            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
            token. See `pixel_values` for `patch_sequence_length`.

        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Batch of document images. Each image is divided into patches of shape `(num_channels, config.patch_size,
            config.patch_size)` and the total number of patches (=`patch_sequence_length`) equals to `((height /
            config.patch_size) * (width / config.patch_size))`.

        visual_bbox (`torch.LongTensor` of shape `(batch_size, patch_sequence_length, 4)`, *optional*):
            Bounding boxes of each patch in the image. If not provided, bounding boxes are created in the model.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@dataclass
class BaseModelOutputWithAttentionMask(ModelOutput):
    """
    Class for the model's outputs that may also contain a past key/values (to speed up sequential decoding). Includes
    an additional attention mask.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model. If `past_key_values` is used only
            the last hidden-state of the sequences of shape `(batch_size, 1, hidden_size)` is output.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or
        when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
            encoder_sequence_length, embed_size_per_head)`. Contains pre-computed hidden-states (key and values in the
            self-attention blocks and optionally if `config.is_encoder_decoder=True` in the cross-attention blocks)
            that can be used (see `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or
        when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
            the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when
        `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and
        `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
            used to compute the weighted average in the cross-attention heads.
    Nlast_hidden_stateattention_maskpast_key_valueshidden_states
attentionscross_attentions)__name__
__module____qualname____doc__r%   torchFloatTensor__annotations__r&   r'   r	   r   r(   r)   r*        b/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/udop/modeling_udop.pyr$   r$      s          B ,0u(///(,NE%,,,AEOXeE%*;$<=>EEE8<M8E%"345<<<59Ju012999;?huU%678?????r3   r$         c           	         | |z  | |z  g}t          j        dd|d         dz   z  d          }||d         z  }t          j        dd|d         dz   z  d          }||d         z  }t          j        |d d                             |d         d          |d d                             |d         d                              dd          |dd                              |d         d          |dd                              |d         d                              dd          gd          }|                    dd          }|S )Nr         ?   dim   )r/   arangestackrepeat	transposeview)
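# With the default image_size=224 and patch_size=16, get_visual_bbox() returns a (196, 4) tensor:
# one normalized (x0, y0, x1, y1) box per cell of the 14 x 14 patch grid, scanned row by row.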
def get_visual_bbox(image_size=224, patch_size=16):
    image_feature_pool_shape = [image_size // patch_size, image_size // patch_size]

    visual_bbox_x = torch.arange(0, 1.0 * (image_feature_pool_shape[1] + 1), 1.0)
    visual_bbox_x /= image_feature_pool_shape[1]

    visual_bbox_y = torch.arange(0, 1.0 * (image_feature_pool_shape[0] + 1), 1.0)
    visual_bbox_y /= image_feature_pool_shape[0]

    visual_bbox_input = torch.stack(
        [
            visual_bbox_x[:-1].repeat(image_feature_pool_shape[0], 1),
            visual_bbox_y[:-1].repeat(image_feature_pool_shape[1], 1).transpose(0, 1),
            visual_bbox_x[1:].repeat(image_feature_pool_shape[0], 1),
            visual_bbox_y[1:].repeat(image_feature_pool_shape[1], 1).transpose(0, 1),
        ],
        dim=-1,
    )

    visual_bbox_input = visual_bbox_input.view(-1, 4)

    return visual_bbox_input


def pad_sequence(seq, target_len, pad_value=0):
    if isinstance(seq, torch.Tensor):
        n = seq.shape[0]
    else:
        n = len(seq)
        seq = torch.tensor(seq)
    m = target_len - n
    if m > 0:
        # pad_value may itself be a tensor (e.g. a zero vector), so build the padding with torch.stack
        ret = torch.stack([pad_value] * m).to(seq)
        seq = torch.cat([seq, ret], dim=0)
    return seq[:target_len]


def combine_image_text_embeddings(
    image_embeddings,
    inputs_embeds,
    bbox,
    visual_bbox,
    attention_mask=None,
    num_patches=14,
    max_len=0,
    image_size=224,
    patch_size=16,
):
    """
    Combine the image and text embeddings for the input to the encoder/decoder of UDOP.

    First, the image embeddings are created by checking for each visual patch if it is inside the bounding box of a
    token. If it is, the visual patch is combined with the token embedding. Then, the visual bounding boxes are combined
    with the text bounding boxes. Finally, the visual bounding boxes are combined with the text attention mask.
    """
    sequence_length = num_patches
    ocr_points_x = torch.clip(
        torch.floor((bbox[:, :, 0] + bbox[:, :, 2]) / 2.0 * sequence_length).long(), 0, sequence_length - 1
    )
    ocr_points_y = (
        torch.clip(torch.floor((bbox[:, :, 1] + bbox[:, :, 3]) / 2.0 * sequence_length).long(), 0, sequence_length - 1)
        * sequence_length
    )
    ocr_points = ocr_points_x + ocr_points_y
    # make sure bounding boxes are of type float to calculate means
    bbox = bbox.to(torch.float64)
    target_seg = (bbox.mean(-1) == 0.0) | (bbox.mean(-1) == 1.0)
    repeated_vision_embeds = torch.gather(
        image_embeddings, 1, ocr_points.unsqueeze(-1).repeat(1, 1, image_embeddings.size(-1))
    )
    repeated_vision_embeds[target_seg] = 0.0
    inputs_embeds += repeated_vision_embeds

    patch_inds = torch.full_like(image_embeddings[:, :, 0], True).bool()
    ind = torch.cat(
        [
            torch.arange(len(ocr_points))[:, None].repeat(1, ocr_points.size(-1))[:, :, None].to(ocr_points),
            ocr_points[:, :, None],
        ],
        dim=-1,
    )
    ind = ind.flatten(0, 1)
    rows, cols = zip(*ind)
    patch_inds[rows, cols] = False

    input_vision_patches = [image_embeddings[i][patch_inds[i]] for i in range(len(patch_inds))]

    if visual_bbox is None:
        visual_bbox = get_visual_bbox(image_size=image_size, patch_size=patch_size)
        visual_bbox = visual_bbox.unsqueeze(0).repeat(inputs_embeds.size(0), 1, 1)
        visual_bbox = visual_bbox.to(inputs_embeds.device)

    visual_bbox = [visual_bbox[i][patch_inds[i]] for i in range(len(patch_inds))]
    if attention_mask is not None:
        visual_attention_mask = [torch.tensor([1] * len(item)).to(attention_mask) for item in visual_bbox]

    if max_len == 0:
        max_len = image_embeddings.size(1)
    else:
        max_len = max_len - inputs_embeds.size(1)
    inputs_vision_patches = torch.stack(
        [pad_sequence(item, max_len, torch.zeros_like(image_embeddings[0, 0])) for item in input_vision_patches]
    )
    visual_bbox = torch.stack([pad_sequence(item, max_len, torch.zeros_like(bbox[0, 0])) for item in visual_bbox])
    if attention_mask is not None:
        visual_attention_mask = torch.stack(
            [pad_sequence(item, max_len, torch.zeros_like(attention_mask[0, 0])) for item in visual_attention_mask]
        )

    inputs_embeds = torch.cat([inputs_embeds, inputs_vision_patches], 1)
    bbox = torch.cat([bbox, visual_bbox], 1)
    if attention_mask is not None:
        attention_mask = torch.cat([attention_mask, visual_attention_mask], 1)
    return inputs_embeds, bbox, attention_mask


class UdopPatchEmbeddings(nn.Module):
    """2D Image to Patch Embeddings"""

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_patches = num_patches
        self.proj = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values):
        batch_size, num_channels, height, width = pixel_values.shape
        if height != self.image_size[0] or width != self.image_size[1]:
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model"
                f" ({self.image_size[0]}*{self.image_size[1]})."
            )
        embeddings = self.proj(pixel_values)
        embeddings = embeddings.flatten(2).transpose(1, 2)
        return embeddings


class UdopPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models. Based on `T5PreTrainedModel`.
    """

    config_class = UdopConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True
    _supports_cache_class = True
    _supports_static_cache = False
    _keep_in_fp32_modules = ["wo"]

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor  # Used for testing weights initialization
        if isinstance(module, UdopLayerNorm):
            module.weight.data.fill_(factor * 1.0)
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=factor)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.Conv2d):
            # Upcast to fp32 and cast back to avoid `trunc_normal_` issues with half precision
            module.weight.data = nn.init.trunc_normal_(module.weight.data.to(torch.float32), mean=0.0, std=factor).to(
                module.weight.dtype
            )
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, RelativePositionBiasBase):
            factor = self.config.initializer_factor
            d_model = self.config.d_model
            module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5))
        elif isinstance(module, UdopModel):
            # Mesh TensorFlow embeddings initialization
            module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0)
        elif isinstance(module, UdopForConditionalGeneration):
            if hasattr(module, "lm_head") and not self.config.tie_word_embeddings:
                module.lm_head.weight.data.normal_(mean=0.0, std=factor * 1.0)
        elif isinstance(module, UdopDenseActDense):
            # Mesh TensorFlow FF initialization
            module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.wi, "bias") and module.wi.bias is not None:
                module.wi.bias.data.zero_()
            module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
            if hasattr(module.wo, "bias") and module.wo.bias is not None:
                module.wo.bias.data.zero_()
        elif isinstance(module, UdopDenseGatedActDense):
            module.wi_0.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.wi_0, "bias") and module.wi_0.bias is not None:
                module.wi_0.bias.data.zero_()
            module.wi_1.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.wi_1, "bias") and module.wi_1.bias is not None:
                module.wi_1.bias.data.zero_()
            module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
            if hasattr(module.wo, "bias") and module.wo.bias is not None:
                module.wo.bias.data.zero_()
        elif isinstance(module, UdopAttention):
            # Mesh TensorFlow attention initialization to avoid scaling before softmax
            d_model = self.config.d_model
            key_value_proj_dim = self.config.d_kv
            n_heads = self.config.num_heads
            module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5))
            module.k.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5))
            module.v.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5))
            module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5))
            if module.has_relative_attention_bias:
                module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5))

    def _shift_right(self, input_ids):
        decoder_start_token_id = self.config.decoder_start_token_id
        pad_token_id = self.config.pad_token_id

        assert decoder_start_token_id is not None, (
            "self.model.config.decoder_start_token_id has to be defined. In Udop it is usually set to the"
            " pad_token_id. See Udop docs for more information."
        )

        # shift inputs to the right
        shifted_input_ids = input_ids.new_zeros(input_ids.shape)
        shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
        shifted_input_ids[..., 0] = decoder_start_token_id

        assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
        # replace possible -100 values in labels by `pad_token_id`
        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

        assert torch.all(shifted_input_ids >= 0).item(), "Verify that `shifted_input_ids` has only positive values"

        return shifted_input_ids


class UdopLayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        Construct a layernorm module in the Udop style. No bias and no subtraction of mean.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # Udop uses a layer_norm which only scales and doesn't shift (RMSNorm), so the variance is
        # computed without subtracting the mean and there is no bias. The accumulation for
        # half-precision inputs is done in fp32.
        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)

        # convert into half-precision if necessary
        if self.weight.dtype in [torch.float16, torch.bfloat16]:
            hidden_states = hidden_states.to(self.weight.dtype)

        return self.weight * hidden_states


class UdopDenseActDense(nn.Module):
    def __init__(self, config: UdopConfig):
        super().__init__()
        self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)
        self.act = ACT2FN[config.dense_act_fn]

    def forward(self, hidden_states):
        hidden_states = self.wi(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.dropout(hidden_states)
        if (
            isinstance(self.wo.weight, torch.Tensor)
            and hidden_states.dtype != self.wo.weight.dtype
            and self.wo.weight.dtype != torch.int8
        ):
            hidden_states = hidden_states.to(self.wo.weight.dtype)
        hidden_states = self.wo(hidden_states)
        return hidden_states


class UdopDenseGatedActDense(nn.Module):
    def __init__(self, config: UdopConfig):
        super().__init__()
        self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)
        self.act = ACT2FN[config.dense_act_fn]

    def forward(self, hidden_states):
        hidden_gelu = self.act(self.wi_0(hidden_states))
        hidden_linear = self.wi_1(hidden_states)
        hidden_states = hidden_gelu * hidden_linear
        hidden_states = self.dropout(hidden_states)

        # self.wo may be kept in float32 (see `_keep_in_fp32_modules`); also make sure the weights
        # are not in `int8` before casting the activations to the weight dtype.
        if (
            isinstance(self.wo.weight, torch.Tensor)
            and hidden_states.dtype != self.wo.weight.dtype
            and self.wo.weight.dtype != torch.int8
        ):
            hidden_states = hidden_states.to(self.wo.weight.dtype)

        hidden_states = self.wo(hidden_states)
        return hidden_states


class UdopLayerFF(nn.Module):
    def __init__(self, config: UdopConfig):
        super().__init__()
        if config.is_gated_act:
            self.DenseReluDense = UdopDenseGatedActDense(config)
        else:
            self.DenseReluDense = UdopDenseActDense(config)

        self.layer_norm = UdopLayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states):
        forwarded_states = self.layer_norm(hidden_states)
        forwarded_states = self.DenseReluDense(forwarded_states)
        hidden_states = hidden_states + self.dropout(forwarded_states)
        return hidden_states


class UdopAttention(nn.Module):
    def __init__(
        self,
        config: UdopConfig,
        has_relative_attention_bias=False,
        layer_idx: Optional[int] = None,
    ):
        super().__init__()
        self.is_decoder = config.is_decoder
        self.has_relative_attention_bias = has_relative_attention_bias
        self.relative_attention_num_buckets = config.relative_attention_num_buckets
        self.relative_attention_max_distance = config.relative_attention_max_distance
        self.d_model = config.d_model
        self.key_value_proj_dim = config.d_kv
        self.n_heads = config.num_heads
        self.dropout = config.dropout_rate
        self.inner_dim = self.n_heads * self.key_value_proj_dim
        self.layer_idx = layer_idx
        if layer_idx is None and self.is_decoder:
            logger.warning_once(
                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended "
                "and will lead to errors during the forward call if caching is used. Please make sure to provide a "
                "`layer_idx` when creating this class."
            )

        # Mesh TensorFlow initialization to avoid scaling before softmax
        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)

        if self.has_relative_attention_bias:
            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
        self.pruned_heads = set()
        self.gradient_checkpointing = False

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads
        )
        # Prune linear layers
        self.q = prune_linear_layer(self.q, index)
        self.k = prune_linear_layer(self.k, index)
        self.v = prune_linear_layer(self.v, index)
        self.o = prune_linear_layer(self.o, index, dim=1)
        # Update hyper params
        self.n_heads = self.n_heads - len(heads)
        self.inner_dim = self.key_value_proj_dim * self.n_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    @staticmethod
    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
        """
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        """
        relative_buckets = 0
        if bidirectional:
            num_buckets //= 2
            relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
            relative_position = torch.abs(relative_position)
        else:
            relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
        # now relative_position is in the range [0, inf)

        # half of the buckets are for exact increments in positions
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact

        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
        relative_position_if_large = max_exact + (
            torch.log(relative_position.float() / max_exact)
            / math.log(max_distance / max_exact)
            * (num_buckets - max_exact)
        ).to(torch.long)
        relative_position_if_large = torch.min(
            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
        )

        relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
        return relative_buckets

    def compute_bias(self, query_length, key_length, device=None, cache_position=None):
        """Compute binned relative position bias"""
        if device is None:
            device = self.relative_attention_bias.weight.device
        if cache_position is None:
            context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
        else:
            context_position = cache_position[:, None].to(device)
        memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
        relative_position = memory_position - context_position  # shape (query_length, key_length)
        relative_position_bucket = self._relative_position_bucket(
            relative_position,  # shape (query_length, key_length)
            bidirectional=(not self.is_decoder),
            num_buckets=self.relative_attention_num_buckets,
            max_distance=self.relative_attention_max_distance,
        )
        values = self.relative_attention_bias(relative_position_bucket)  # shape (query_length, key_length, num_heads)
        values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, query_length, key_length)
        return values

    def forward(
        self,
        hidden_states,
        mask=None,
        key_value_states=None,
        position_bias=None,
        past_key_value=None,
        layer_head_mask=None,
        query_length=None,
        use_cache=False,
        output_attentions=False,
        cache_position=None,
    ):
        """
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        """
        batch_size, seq_length = hidden_states.shape[:2]

        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
        is_cross_attention = key_value_states is not None

        query_states = self.q(hidden_states)
        query_states = query_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)

        if past_key_value is not None:
            is_updated = past_key_value.is_updated.get(self.layer_idx)
            if is_cross_attention:
                # after the first generated id, we can subsequently re-use all key/value_states from cache
                curr_past_key_value = past_key_value.cross_attention_cache
            else:
                curr_past_key_value = past_key_value.self_attention_cache

        current_states = key_value_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_value is not None and is_updated:
            # reuse k,v, cross_attentions
            key_states = curr_past_key_value.key_cache[self.layer_idx]
            value_states = curr_past_key_value.value_cache[self.layer_idx]
        else:
            key_states = self.k(current_states)
            value_states = self.v(current_states)
            key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
            value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)

            if past_key_value is not None:
                # save all key/value_states to cache to be re-used for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_states, value_states = curr_past_key_value.update(
                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
                )
                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
                if is_cross_attention:
                    past_key_value.is_updated[self.layer_idx] = True

        # compute scores, equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states)
        scores = torch.matmul(query_states, key_states.transpose(3, 2))

        if position_bias is None:
            key_length = key_states.shape[-2]
            # cache position is 0-indexed so we add 1 to get the real length of queries (aka with past)
            real_seq_length = query_length if query_length is not None else cache_position[-1] + 1
            if not self.has_relative_attention_bias:
                position_bias = torch.zeros(
                    (1, self.n_heads, seq_length, key_length), device=scores.device, dtype=scores.dtype
                )
                if self.gradient_checkpointing and self.training:
                    position_bias.requires_grad = True
            else:
                position_bias = self.compute_bias(
                    real_seq_length, key_length, device=scores.device, cache_position=cache_position
                )
                position_bias = position_bias[:, :, -seq_length:, :]

            if mask is not None:
                causal_mask = mask[:, :, :, : key_states.shape[-2]]
                position_bias = position_bias + causal_mask

        if self.pruned_heads:
            mask = torch.ones(position_bias.shape[1])
            mask[list(self.pruned_heads)] = 0
            position_bias_masked = position_bias[:, mask.bool()]
        else:
            position_bias_masked = position_bias

        scores += position_bias_masked

        # (batch_size, n_heads, seq_length, key_length)
        attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)
        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        # Mask heads if we want to
        if layer_head_mask is not None:
            attn_weights = attn_weights * layer_head_mask

        attn_output = torch.matmul(attn_weights, value_states)

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(batch_size, -1, self.inner_dim)
        attn_output = self.o(attn_output)

        outputs = (attn_output, past_key_value, position_bias)

        if output_attentions:
            outputs = outputs + (attn_weights,)
        return outputs


class UdopLayerSelfAttention(nn.Module):
    def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None):
        super().__init__()
        self.SelfAttention = UdopAttention(
            config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx
        )
        self.layer_norm = UdopLayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
        cache_position=None,
    ):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.SelfAttention(
            normed_hidden_states,
            mask=attention_mask,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        hidden_states = hidden_states + self.dropout(attention_output[0])
        outputs = (hidden_states,) + attention_output[1:]  # add attentions if we output them
        return outputs


class UdopLayerCrossAttention(nn.Module):
    def __init__(self, config, layer_idx: Optional[int] = None):
        super().__init__()
        self.EncDecAttention = UdopAttention(config, has_relative_attention_bias=False, layer_idx=layer_idx)
        self.layer_norm = UdopLayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        key_value_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        query_length=None,
        output_attentions=False,
        cache_position=None,
    ):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.EncDecAttention(
            normed_hidden_states,
            mask=attention_mask,
            key_value_states=key_value_states,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            query_length=query_length,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        layer_output = hidden_states + self.dropout(attention_output[0])
        outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
        return outputs


class UdopBlock(nn.Module):
    def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None):
        super().__init__()
        self.is_decoder = config.is_decoder
        self.layer = nn.ModuleList()
        self.layer.append(
            UdopLayerSelfAttention(
                config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx
            )
        )
        if self.is_decoder:
            self.layer.append(UdopLayerCrossAttention(config, layer_idx=layer_idx))

        self.layer.append(UdopLayerFF(config))

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        encoder_decoder_position_bias=None,
        layer_head_mask=None,
        cross_attn_layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
        return_dict=True,
        cache_position=None,
    ):
        self_attention_outputs = self.layer[0](
            hidden_states,
            attention_mask=attention_mask,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        hidden_states, past_key_value = self_attention_outputs[:2]
        attention_outputs = self_attention_outputs[2:]  # Keep self-attention outputs and relative position weights

        # clamp inf values to enable fp16 training
        if hidden_states.dtype == torch.float16:
            clamp_value = torch.where(
                torch.isinf(hidden_states).any(),
                torch.finfo(hidden_states.dtype).max - 1000,
                torch.finfo(hidden_states.dtype).max,
            )
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        do_cross_attention = self.is_decoder and encoder_hidden_states is not None
        if do_cross_attention:
            cross_attention_outputs = self.layer[1](
                hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                position_bias=encoder_decoder_position_bias,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=past_key_value,
                query_length=cache_position[-1] + 1,
                use_cache=use_cache,
                output_attentions=output_attentions,
            )
            hidden_states, past_key_value = cross_attention_outputs[:2]

            # clamp inf values to enable fp16 training
            if hidden_states.dtype == torch.float16:
                clamp_value = torch.where(
                    torch.isinf(hidden_states).any(),
                    torch.finfo(hidden_states.dtype).max - 1000,
                    torch.finfo(hidden_states.dtype).max,
                )
                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

            # Keep cross-attention outputs and relative position weights
            attention_outputs = attention_outputs + cross_attention_outputs[2:]

        # Apply Feed Forward layer
        hidden_states = self.layer[-1](hidden_states)

        # clamp inf values to enable fp16 training
        if hidden_states.dtype == torch.float16:
            clamp_value = torch.where(
                torch.isinf(hidden_states).any(),
                torch.finfo(hidden_states.dtype).max - 1000,
                torch.finfo(hidden_states.dtype).max,
            )
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        outputs = (hidden_states,)

        if use_cache:
            outputs = outputs + (past_key_value,) + attention_outputs
        else:
            outputs = outputs + attention_outputs

        return outputs


class UdopCellEmbeddings(nn.Module):
    def __init__(self, max_2d_position_embeddings=501, hidden_size=1024):
        super(UdopCellEmbeddings, self).__init__()
        self.max_2d_position_embeddings = max_2d_position_embeddings

        self.x_position_embeddings = nn.Embedding(max_2d_position_embeddings, hidden_size)
        self.y_position_embeddings = nn.Embedding(max_2d_position_embeddings, hidden_size)

    def forward(self, bbox):
        bbox = torch.clip(bbox, 0.0, 1.0)
        bbox = (bbox * (self.max_2d_position_embeddings - 1)).long()
        left_position_embeddings = self.x_position_embeddings(bbox[:, :, 0])
        upper_position_embeddings = self.y_position_embeddings(bbox[:, :, 1])
        right_position_embeddings = self.x_position_embeddings(bbox[:, :, 2])
        lower_position_embeddings = self.y_position_embeddings(bbox[:, :, 3])

        embeddings = (
            left_position_embeddings
            + upper_position_embeddings
            + right_position_embeddings
            + lower_position_embeddings
        )

        return embeddings


# get function for bucket computation
# protected member access seems to be lesser evil than copy-pasting the whole function
get_relative_position_bucket = UdopAttention._relative_position_bucket
AUGMENTATION_RANGE = (0.80, 1.25)


class RelativePositionBiasBase(nn.Module, ABC):
    """
    Base class of relative biases.

    Args:
        num_heads (`int`):
            Number of attention heads in the model, it will create embeddings of size `num_heads`, which will be added to the scores of each token pair.
        relative_attention_num_buckets (`int`, *optional*, defaults to 32):
            Pair token metric (distance in the sequence, distance in pixels etc.) will be bucketed, parameter is defining number of such
            buckets.
        bidirectional (`bool`, *optional*, defaults to `True`):
            Whether the distance should be bidirectional for a pair of tokens. If `False`, then distance(tok1, tok2) == distance(tok2, tok1).
        scaling_factor (`int`, *optional*, defaults to 1):
            Defining factor which will be used to scale relative distance.
        max_distance (`int`, *optional*, defaults to 128):
            All distances above this value will end up in the one/same bucket.
        augmentation (`bool`, *optional*, defaults to `False`):
            Whether to multiply relative distances by a random scalar.
        expand (`bool`, *optional*, defaults to `False`):
            Whether to expand an existing pretrained model with subsequent additions of prefix_bucket.
    """

    def __init__(
        self,
        num_heads=None,
        relative_attention_num_buckets=32,
        bidirectional=True,
        scaling_factor=1,
        max_distance=128,
        level="tokens",
        augmentation=False,
        prefix_bucket=False,
        expand=False,
    ):
        super(RelativePositionBiasBase, self).__init__()
        self.prefix_bucket = prefix_bucket
        self.augmentation = augmentation
        self.level = level
        self.max_distance = max_distance
        self.scaling_factor = scaling_factor
        self.relative_attention_num_buckets = relative_attention_num_buckets
        self.bidirectional = bidirectional
        self.num_heads = num_heads
        self.expand = expand
        extra_head = 2 if prefix_bucket and not self.expand else 0
        self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets + extra_head, self.num_heads)

    @abstractmethod
    def prepare_input(
        self,
        attention_mask: Optional[Tensor] = None,
        bbox: Optional[Dict[str, Any]] = None,
    ) -> Tensor:
        pass

    def get_bucket(self, attention_mask: Optional[Tensor] = None, bbox: Optional[Dict[str, Any]] = None) -> Tensor:
        relative_position = self.prepare_input(attention_mask, bbox)
        # map raw (scaled) pairwise distances to a bounded set of embedding indices
        rp_bucket: Tensor = get_relative_position_bucket(
            relative_position,
            bidirectional=self.bidirectional,
            num_buckets=self.relative_attention_num_buckets,
            max_distance=self.max_distance,
        )
        return rp_bucket

    def get_relative_position(self, positions):
        context_position = positions[:, :, None]
        memory_position = positions[:, None, :]
        relative_position = memory_position - context_position
        if self.augmentation and self.training:
            relative_position *= random.uniform(*AUGMENTATION_RANGE)
        relative_position *= self.scaling_factor

        return relative_position.to(torch.long)

    def forward(self, attention_mask: Optional[Tensor] = None, bbox: Optional[Dict[str, Any]] = None) -> Tensor:
        # re-using a pretrained model with subsequent addition of prefix_bucket
        if self.expand and self.prefix_bucket:
            new_bias = nn.Embedding(self.relative_attention_num_buckets + 2, self.num_heads)
            new_bias.weight.data[: self.relative_attention_num_buckets] = self.relative_attention_bias.weight.data
            new_bias.weight.data[self.relative_attention_num_buckets :] = 0.1
            self.relative_attention_bias = new_bias
            self.expand = False

        rp_bucket = self.get_bucket(attention_mask, bbox)

        if self.prefix_bucket:
            if rp_bucket.size(0) == 1 and attention_mask.size(0) > 1:
                rp_bucket = rp_bucket.repeat(attention_mask.size(0), 1, 1)
            # based on the assumption that prefix bounding boxes are negative
            is_prefix = bbox[:, :, 1] < 0
            num_prefix = is_prefix.sum(-1)
            for idx, num_prefix_row in enumerate(num_prefix.cpu().numpy()):
                rp_bucket[idx, :num_prefix_row, num_prefix_row:] = self.relative_attention_num_buckets
                rp_bucket[idx, num_prefix_row:, :num_prefix_row] = self.relative_attention_num_buckets + 1

        values = self.relative_attention_bias(rp_bucket)
        if values.dim() != 4:
            raise ValueError("Wrong dimension of values tensor")
        values = values.permute([0, 3, 1, 2])

        return values
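
# Illustrative sketch (not part of the original modeling code): shows how pairwise sequence
# distances are mapped to a bounded set of bucket ids by `get_relative_position_bucket`,
# which is defined earlier in this file. The sizes used here are arbitrary.
def _example_relative_position_bucketing():  # pragma: no cover - documentation aid only
    positions = torch.arange(6, dtype=torch.long)[None, :]
    relative_position = positions[:, None, :] - positions[:, :, None]  # (1, 6, 6)
    buckets = get_relative_position_bucket(
        relative_position, bidirectional=True, num_buckets=32, max_distance=128
    )
    # every pairwise distance falls into one of `num_buckets` buckets
    assert int(buckets.min()) >= 0 and int(buckets.max()) < 32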
	   ^ &)9 QUVY[^V^Q_H` lr    0 0 0 hv&6 XdSVX[S[nE] io        r3   r   c                   ^     e Zd Zd	 fd	Zd
dee         deeeef                  defdZ	 xZ
S )RelativePositionBias1Dr9   r)  c                 @     t                      j        d||d| dS )z
        Reimplementation of T5 relative position bias. Distance between given tokens is their distance in the sequence.
        Parameters are the same as in base class
        r  r4  Nr2   r   r   r   r  r4  kwargsr   s       r4   r   zRelativePositionBias1D.__init__  0    
 	\\\\U[\\\\\r3   Nr&   rl   r  c                     | j         dk    rt          d          |                     t          j        |                    d          t          j        |j                  d d d f                   }|S )Nr9   zNo need to scale 1d featuresr;  )r  r   r  r/   r>   ru   rp   r{   )r   r&   rl   r1  s       r4   r  z$RelativePositionBias1D.prepare_input  sv    !##;<<< 66L,,Q//uz.J_```aeghghghahi
 
 ! r3   )r9   r)  rp  r+   r,   r-   r   r	   r   r   r  r   r  r   r   s   @r4   r  r    s        ] ] ] ] ] ]! !HV,< !8TXY\^aYaTbKc !ou ! ! ! ! ! ! ! !r3   r  c                   ^     e Zd Zd fd	Zd	dee         deeeef                  defdZ	 xZ
S )


class RelativePositionBiasHorizontal(RelativePositionBiasBase):
    def __init__(self, scaling_factor=100, max_distance=100, **kwargs):
        """
        Represents in the bucket embeddings horizontal distance between two tokens. Parameters are the same as in base
        class
        """
        super().__init__(scaling_factor=scaling_factor, max_distance=max_distance, **kwargs)

    def prepare_input(self, attention_mask: Optional[Tensor] = None, bbox: Optional[Dict[str, Any]] = None) -> Tensor:
        if not self.scaling_factor > 1.0:
            raise ValueError("Need to scale the values of bboxes, as there are in small (0,1) range")
        if bbox is None:
            raise ValueError("Bbox is required for horizontal relative position bias")
        # get the horizontal (x) centre of each bounding box
        horizontal_position: Tensor = bbox[:, :, [0, 2]].mean(dim=-1)

        return self.get_relative_position(horizontal_position)


class RelativePositionBiasVertical(RelativePositionBiasBase):
    def __init__(self, scaling_factor=100, max_distance=100, **kwargs):
        """
        Represents in the bucket embeddings vertical distance between two tokens. Parameters are the same as in base
        class
        """
        super().__init__(scaling_factor=scaling_factor, max_distance=max_distance, **kwargs)

    def prepare_input(self, attention_mask: Optional[Tensor] = None, bbox: Optional[Dict[str, Any]] = None) -> Tensor:
        if not self.scaling_factor > 1.0:
            raise ValueError("Need to scale the values of bboxes, as there are in small (0,1) range")
        if bbox is None:
            raise ValueError("Bbox is required for vertical relative position bias")
        # get the vertical (y) centre of each bounding box
        vertical_position: Tensor = bbox[:, :, [1, 3]].mean(dim=-1)

        return self.get_relative_position(vertical_position)
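
# Sketch of what `RelativePositionBiasAggregated` (defined right below) automates: the
# individual bias modules are simply summed into one additive attention bias. Illustrative only.
def _example_sum_of_2d_biases():  # pragma: no cover - documentation aid only
    bbox = torch.rand(1, 6, 4)
    horizontal = RelativePositionBiasHorizontal(num_heads=8)
    vertical = RelativePositionBiasVertical(num_heads=8)
    total = horizontal(bbox=bbox) + vertical(bbox=bbox)
    assert total.shape == (1, 8, 6, 6)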


class RelativePositionBiasAggregated(nn.Module):
    def __init__(self, modules: Sequence[RelativePositionBiasBase]):
        """
        Class which sums up various computed biases.

        Args:
            modules (Sequence[RelativePositionBiasBase]):
                List of relative bias modules.
        """
        super().__init__()
        self.biases = nn.ModuleList(modules)

    def forward(
        self, attention_mask: Optional[Tensor] = None, bbox: Optional[Dict[str, Any]] = None
    ) -> Union[float, Tensor]:
        output = 0.0
        for bias in self.biases:  # type: ignore
            output = bias(attention_mask, bbox) + output

        return output


BIAS_CLASSES = {
    "1d": RelativePositionBias1D,
    "horizontal": RelativePositionBiasHorizontal,
    "vertical": RelativePositionBiasVertical,
}


def create_relative_bias(config: UdopConfig) -> Sequence[RelativePositionBiasBase]:
    """
    Creates empty list or one/multiple relative biases.

    :param config: Model's configuration :return: Sequence with created bias modules.
    """
    bias_list = []
    if hasattr(config, "relative_bias_args"):
        for bias_kwargs_org in config.relative_bias_args:
            bias_kwargs = deepcopy(bias_kwargs_org)
            bias_type = bias_kwargs.pop("type")
            model_num_heads = config.num_heads if hasattr(config, "num_heads") else config.num_attention_heads
            if "num_heads" in bias_kwargs:
                if bias_kwargs["num_heads"] != model_num_heads:
                    raise ValueError("Number of heads must match num of heads in the model")
            else:
                bias_kwargs["num_heads"] = model_num_heads
            bias_list.append(BIAS_CLASSES[bias_type](**bias_kwargs))  # type: ignore

    return bias_list


class UdopStack(UdopPreTrainedModel):
    """
    This class is based on `T5Stack`, but modified to take into account the image modality as well as 2D position
    embeddings.
    """

    def __init__(self, config, embed_tokens=None, embed_patches=None):
        super().__init__(config)

        self.embed_tokens = embed_tokens
        self.embed_patches = embed_patches
        self.is_decoder = config.is_decoder
        self._max_length = config.max_length
        self.num_layers = config.num_layers

        self.block = nn.ModuleList(
            [UdopBlock(config, has_relative_attention_bias=bool(i == 0)) for i in range(self.num_layers)]
        )
        self.final_layer_norm = UdopLayerNorm(config.d_model, eps=config.layer_norm_epsilon)

        self.dropout = nn.Dropout(config.dropout_rate)

        if not self.is_decoder:
            self.cell_2d_embedding = UdopCellEmbeddings(config.max_2d_position_embeddings, config.hidden_size)

        # get weights from encoder position bias
        self.relative_bias = self._get_relative_bias(config)

    def _tie_weights(self):
        for bias in self.relative_bias.biases:
            if isinstance(bias, RelativePositionBias1D):
                self._tie_or_clone_weights(
                    bias.relative_attention_bias, self.block[0].layer[0].SelfAttention.relative_attention_bias
                )

    @staticmethod
    def _get_relative_bias(config: UdopConfig) -> RelativePositionBiasAggregated:
        relative_bias_list = create_relative_bias(config)
        return RelativePositionBiasAggregated(relative_bias_list)

    def get_input_embeddings(self):
        return self.embed_tokens

    def get_output_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, new_embeddings):
        self.embed_tokens = new_embeddings

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        bbox=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        inputs_embeds=None,
        pixel_values=None,
        visual_bbox=None,
        image_embeddings=None,
        position_bias=None,
        head_mask=None,
        cross_attn_head_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        cache_position=None,
    ):
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # input embeddings processing
        if input_ids is not None and inputs_embeds is not None:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(
                f"You cannot specify both {err_msg_prefix}inputs and {err_msg_prefix}inputs_embeds at the same time"
            )
        elif input_ids is not None and torch.numel(input_ids) > 0:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is None and input_ids is not None and torch.numel(input_ids) == 0:
            # empty batch: fall back to dummy inputs so the forward pass can still run
            input_ids = torch.full((4, 1024), self.config.pad_token_id, device=input_ids.device, dtype=input_ids.dtype)
            attention_mask = torch.zeros((4, 1024), device=input_ids.device, dtype=input_ids.dtype)
            bbox = torch.zeros((4, 1024, 4), device=input_ids.device, dtype=input_ids.dtype)
            input_shape = input_ids.size()
            position_bias = torch.zeros_like(self.get_extended_attention_mask(attention_mask, input_shape))
            logger.warning("Empty batch")
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(f"You have to specify either {err_msg_prefix}inputs or {err_msg_prefix}inputs_embeds")

        if inputs_embeds is None:
            if self.embed_tokens is None:
                raise ValueError("You have to intialize the model with valid token embeddings")
            inputs_embeds = self.embed_tokens(input_ids)

        if pixel_values is not None:
            image_embeddings = self.embed_patches(pixel_values)

        if image_embeddings is not None:
            # combine visual and OCR text embeddings
            num_patches = self.config.image_size // self.config.patch_size
            inputs_embeds, bbox, attention_mask = combine_image_text_embeddings(
                image_embeddings,
                inputs_embeds,
                bbox,
                visual_bbox,
                attention_mask,
                num_patches,
                0,
                self.config.image_size,
                self.config.patch_size,
            )
            input_shape = inputs_embeds.size()[:-1]

        if not self.is_decoder and bbox is not None:
            inputs_embeds += self.cell_2d_embedding(bbox)

        batch_size, seq_length = input_shape

        if use_cache is True:
            if not self.is_decoder:
                raise ValueError("`use_cache` can only be set to `True` if {} is used as a decoder".format(self))

        # initialize past_key_values
        return_legacy_cache = False
        return_self_attention_cache = False
        if self.is_decoder and (use_cache or past_key_values is not None):
            if isinstance(past_key_values, Cache) and not isinstance(past_key_values, EncoderDecoderCache):
                return_self_attention_cache = True
                past_key_values = EncoderDecoderCache(past_key_values, DynamicCache())
            elif not isinstance(past_key_values, EncoderDecoderCache):
                return_legacy_cache = True
                logger.warning_once(
                    "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. "
                    "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
                    "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
                )
                past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
            elif past_key_values is None:
                past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache())
        elif not self.is_decoder:
            # do not pass a cache object down the line for the encoder stack
            past_key_values = None

        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
        if cache_position is None:
            cache_position = torch.arange(
                past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
            )

        if attention_mask is None and not is_torchdynamo_compiling():
            # required mask seq length can be calculated via length of past cache
            mask_seq_length = past_key_values_length + seq_length
            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)

        if self.config.is_decoder:
            causal_mask = self._update_causal_mask(
                attention_mask,
                inputs_embeds,
                cache_position,
                past_key_values.self_attention_cache if past_key_values is not None else None,
                output_attentions,
            )
        else:
            causal_mask = attention_mask[:, None, None, :]
            causal_mask = causal_mask.to(dtype=inputs_embeds.dtype)
            causal_mask = (1.0 - causal_mask) * torch.finfo(inputs_embeds.dtype).min

        if self.is_decoder and encoder_attention_mask is not None:
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.num_layers)
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and self.is_decoder) else None

        if self.is_decoder:  # modified lines
            position_bias = None
        else:
            position_bias = self.relative_bias(attention_mask=attention_mask, bbox=bbox)
            position_bias = position_bias + causal_mask
        encoder_decoder_position_bias = None

        hidden_states = self.dropout(inputs_embeds)

        for i, layer_module in enumerate(self.block):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = layer_module(
                hidden_states,
                attention_mask=causal_mask,
                position_bias=position_bias,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_extended_attention_mask,
                encoder_decoder_position_bias=encoder_decoder_position_bias,
                layer_head_mask=head_mask[i],
                past_key_value=past_key_values,
                use_cache=use_cache,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )
            if use_cache is False:
                layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:]
            hidden_states, next_decoder_cache = layer_outputs[:2]

            # the position bias is shared between the layers; the first layer stores it
            position_bias = layer_outputs[2]
            if self.is_decoder and encoder_hidden_states is not None:
                encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3]
            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[3],)
                if self.is_decoder:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[5],)

        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if return_self_attention_cache:
            next_cache = past_key_values.self_attention_cache
        if return_legacy_cache:
            next_cache = next_cache.to_legacy_cache()

        if not return_dict:
            return tuple(
                value
                for value in [
                    hidden_states,
                    attention_mask,
                    next_cache,
                    all_hidden_states,
                    all_attentions,
                    all_cross_attentions,
                ]
                if value is not None
            )
        return BaseModelOutputWithAttentionMask(
            last_hidden_state=hidden_states,
            attention_mask=attention_mask,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
            cross_attentions=all_cross_attentions,
        )

    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_static_cache = isinstance(past_key_values, StaticCache)

        # When output_attentions=True, an eager mask is required; otherwise SDPA may be able to
        # rely on its `is_causal` argument instead of an explicit mask.
        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype, device = input_tensor.dtype, input_tensor.device
        sequence_length = input_tensor.shape[1]
        if using_static_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided attention mask is 2D, generate the causal 4D mask here.
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            device=device,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type == "cuda"
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows (e.g. with left padding), as required by
            # the memory-efficient path of F.scaled_dot_product_attention.
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        device: torch.device,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            device (`torch.device`):
                The device to place the 4D attention mask on.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # the mask is assumed to already be in the inverted 4D form and needs no further processing
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask


@add_start_docstrings(
    "The bare UDOP encoder-decoder Transformer outputting raw hidden-states without any specific head on top.",
    UDOP_START_DOCSTRING,
)
class UdopModel(UdopPreTrainedModel):
    _tied_weights_keys = [
        "encoder.embed_tokens.weight",
        "decoder.embed_tokens.weight",
        "encoder.embed_patches.proj.weight",
        "encoder.embed_patches.proj.bias",
        "encoder.relative_bias.biases.0.relative_attention_bias.weight",
        "decoder.relative_bias.biases.0.relative_attention_bias.weight",
    ]

    def __init__(self, config):
        super(UdopModel, self).__init__(config)

        # text and image embeddings
        self.shared = nn.Embedding(config.vocab_size, config.d_model)
        self.patch_embed = UdopPatchEmbeddings(config)

        encoder_config = deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = UdopStack(encoder_config, self.shared, self.patch_embed)

        decoder_config = deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = UdopStack(decoder_config, self.shared)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    @add_start_docstrings_to_model_forward(UDOP_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Tensor = None,
        attention_mask: Tensor = None,
        bbox: Dict[str, Any] = None,
        pixel_values: Optional[Tensor] = None,
        visual_bbox: Dict[str, Any] = None,
        decoder_input_ids: Optional[Tensor] = None,
        decoder_attention_mask: Optional[Tensor] = None,
        inputs_embeds: Optional[Tensor] = None,
        encoder_outputs: Optional[Tensor] = None,
        past_key_values: Optional[Tensor] = None,
        head_mask: Optional[Tensor] = None,
        decoder_inputs_embeds: Optional[Tensor] = None,
        decoder_head_mask: Optional[Tensor] = None,
        cross_attn_head_mask: Optional[Tensor] = None,
        use_cache=True,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[Tensor, ...]:
        r"""
        Returns:

        Example:

        ```python
        >>> from transformers import AutoProcessor, AutoModel
        >>> from datasets import load_dataset
        >>> import torch

        >>> # load model and processor
        >>> # in this case, we already have performed OCR ourselves
        >>> # so we initialize the processor with `apply_ocr=False`
        >>> processor = AutoProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False)
        >>> model = AutoModel.from_pretrained("microsoft/udop-large")

        >>> # load an example image, along with the words and coordinates
        >>> # which were extracted using an OCR engine
        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
        >>> example = dataset[0]
        >>> image = example["image"]
        >>> words = example["tokens"]
        >>> boxes = example["bboxes"]
        >>> inputs = processor(image, words, boxes=boxes, return_tensors="pt")

        >>> decoder_input_ids = torch.tensor([[model.config.decoder_start_token_id]])

        >>> # forward pass
        >>> outputs = model(**inputs, decoder_input_ids=decoder_input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 1, 1024]
        """
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                bbox=bbox,
                pixel_values=pixel_values,
                visual_bbox=visual_bbox,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )

        hidden_states = encoder_outputs[0]
        encoder_attention_mask = encoder_outputs.attention_mask if return_dict else encoder_outputs[1]

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        if not return_dict:
            # filter the attention mask (kept at index 1 of the stack outputs) out of the returned tuples
            decoder_outputs = tuple(value for idx, value in enumerate(decoder_outputs) if idx != 1)
            encoder_outputs = tuple(value for idx, value in enumerate(encoder_outputs) if idx != 1)
            return decoder_outputs + encoder_outputs

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


@add_start_docstrings(
    """The UDOP encoder-decoder Transformer with a language modeling head on top, enabling to generate text given document
    images and an optional prompt.

    This class is based on [`T5ForConditionalGeneration`], extended to deal with images and layout (2D) data.""",
    UDOP_START_DOCSTRING,
)
class UdopForConditionalGeneration(UdopPreTrainedModel, GenerationMixin):
    _tied_weights_keys = [
        "encoder.embed_tokens.weight",
        "decoder.embed_tokens.weight",
        "encoder.embed_patches.proj.weight",
        "encoder.embed_patches.proj.bias",
        "encoder.relative_bias.biases.0.relative_attention_bias.weight",
        "decoder.relative_bias.biases.0.relative_attention_bias.weight",
        "lm_head.weight",
    ]

    def __init__(self, config):
        super(UdopForConditionalGeneration, self).__init__(config)

        # text and image embeddings
        self.shared = nn.Embedding(config.vocab_size, config.d_model)
        self.patch_embed = UdopPatchEmbeddings(config)

        encoder_config = deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = UdopStack(encoder_config, self.shared, self.patch_embed)

        decoder_config = deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = UdopStack(decoder_config, self.shared)

        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def get_output_embeddings(self):
        return self.lm_head

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    @add_start_docstrings_to_model_forward(UDOP_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Tensor = None,
        attention_mask: Tensor = None,
        bbox: Dict[str, Any] = None,
        pixel_values: Optional[Tensor] = None,
        visual_bbox: Dict[str, Any] = None,
        decoder_input_ids: Optional[Tensor] = None,
        decoder_attention_mask: Optional[Tensor] = None,
        inputs_embeds: Optional[Tensor] = None,
        encoder_outputs: Optional[Tensor] = None,
        past_key_values: Optional[Tensor] = None,
        head_mask: Optional[Tensor] = None,
        decoder_inputs_embeds: Optional[Tensor] = None,
        decoder_head_mask: Optional[Tensor] = None,
        cross_attn_head_mask: Optional[Tensor] = None,
        use_cache=True,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[Tensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[Tensor, ...]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size -
            1]`. All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size]`.

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoProcessor, UdopForConditionalGeneration
        >>> from datasets import load_dataset

        >>> # load model and processor
        >>> # in this case, we already have performed OCR ourselves
        >>> # so we initialize the processor with `apply_ocr=False`
        >>> processor = AutoProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False)
        >>> model = UdopForConditionalGeneration.from_pretrained("microsoft/udop-large")

        >>> # load an example image, along with the words and coordinates
        >>> # which were extracted using an OCR engine
        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
        >>> example = dataset[0]
        >>> image = example["image"]
        >>> words = example["tokens"]
        >>> boxes = example["bboxes"]

        >>> # one can use the various task prefixes (prompts) used during pre-training
        >>> # e.g. the task prefix for DocVQA is "Question answering. "
        >>> question = "Question answering. What is the date on the form?"
        >>> encoding = processor(image, question, text_pair=words, boxes=boxes, return_tensors="pt")

        >>> # autoregressive generation
        >>> predicted_ids = model.generate(**encoding)
        >>> print(processor.batch_decode(predicted_ids, skip_special_tokens=True)[0])
        9/30/92
        """
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if decoder_input_ids is None and labels is not None:
            # get decoder inputs from shifting lm labels to the right
            decoder_input_ids = self._shift_right(labels)

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                bbox=bbox,
                pixel_values=pixel_values,
                visual_bbox=visual_bbox,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )

        hidden_states = encoder_outputs[0]
        encoder_attention_mask = encoder_outputs.attention_mask if return_dict else encoder_outputs[1]

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        sequence_output = decoder_outputs[0]

        if self.config.tie_word_embeddings:
            # Rescale output before projecting on vocab
            sequence_output = sequence_output * (self.config.d_model**-0.5)

        lm_logits = self.lm_head(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + decoder_outputs[2:] + (encoder_outputs[0],) + encoder_outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqLMOutput(
            loss=loss,
            logits=lm_logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )

    def _reorder_cache(self, past_key_values, beam_idx):
        # if decoder past is not included in output
        # speedy decoding is disabled and no need to reorder
        if past_key_values is None:
            logger.warning("You might want to consider setting `use_cache=True` to speed up decoding")
            return past_key_values

        reordered_decoder_past = ()
        for layer_past_states in past_key_values:
            # get the correct batch idx from layer past batch dim
            reordered_layer_past_states = ()
            for layer_past_state in layer_past_states:
                # need to set the correct `past` for each of the key / value states
                reordered_layer_past_states = reordered_layer_past_states + (
                    layer_past_state.index_select(0, beam_idx.to(layer_past_state.device)),
                )

            if reordered_layer_past_states[0].shape != layer_past_states[0].shape:
                raise ValueError(
                    f"reordered_layer_past_states[0] shape {reordered_layer_past_states[0].shape} and "
                    f"layer_past_states[0] shape {layer_past_states[0].shape} mismatched"
                )
            if len(reordered_layer_past_states) != len(layer_past_states):
                raise ValueError(
                    f"length of reordered_layer_past_states {len(reordered_layer_past_states)} and "
                    f"length of layer_past_states {len(layer_past_states)} mismatched"
                )

            reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,)
        return reordered_decoder_past


@add_start_docstrings(
    "The bare UDOP Model transformer outputting encoder's raw hidden-states without any specific head on top.",
    UDOP_START_DOCSTRING,
)
class UdopEncoderModel(UdopPreTrainedModel):
    _tied_weights_keys = [
        "encoder.embed_tokens.weight",
        "encoder.embed_patches.proj.weight",
        "encoder.embed_patches.proj.bias",
        "encoder.relative_bias.biases.0.relative_attention_bias.weight",
    ]

    def __init__(self, config: UdopConfig):
        super().__init__(config)

        # text and image embeddings
        self.shared = nn.Embedding(config.vocab_size, config.d_model)
        self.patch_embed = UdopPatchEmbeddings(config)

        encoder_config = deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = UdopStack(encoder_config, self.shared, self.patch_embed)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)

    def get_encoder(self):
        return self.encoder

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(UDOP_ENCODER_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithAttentionMask, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Tensor = None,
        bbox: Dict[str, Any] = None,
        attention_mask: Tensor = None,
        pixel_values: Optional[Tensor] = None,
        visual_bbox: Dict[str, Any] = None,
        head_mask: Optional[Tensor] = None,
        inputs_embeds: Optional[Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.FloatTensor, ...], BaseModelOutputWithAttentionMask]:
        r"""
        Returns:

        Example:

        ```python
        >>> from transformers import AutoProcessor, UdopEncoderModel
        >>> from huggingface_hub import hf_hub_download
        >>> from datasets import load_dataset

        >>> # load model and processor
        >>> # in this case, we already have performed OCR ourselves
        >>> # so we initialize the processor with `apply_ocr=False`
        >>> processor = AutoProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False)
        >>> model = UdopEncoderModel.from_pretrained("microsoft/udop-large")

        >>> # load an example image, along with the words and coordinates
        >>> # which were extracted using an OCR engine
        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
        >>> example = dataset[0]
        >>> image = example["image"]
        >>> words = example["tokens"]
        >>> boxes = example["bboxes"]
        >>> encoding = processor(image, words, boxes=boxes, return_tensors="pt")

        >>> outputs = model(**encoding)
        >>> last_hidden_states = outputs.last_hidden_state
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_outputs = self.encoder(
            input_ids=input_ids,
            bbox=bbox,
            visual_bbox=visual_bbox,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return encoder_outputs