from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch

from .utils.import_utils import is_torchdynamo_compiling


@dataclass
class AttentionMaskConverter:
    """
    A utility attention mask class that allows one to:
        - Create a causal 4d mask
        - Create a causal 4d mask with sliding window
        - Convert a 2d attention mask (batch_size, query_length) to a 4d attention mask (batch_size, 1, query_length,
          key_value_length) that can be multiplied with attention scores

    Examples:

    ```python
    >>> import torch
    >>> from transformers.modeling_attn_mask_utils import AttentionMaskConverter

    >>> converter = AttentionMaskConverter(True)
    >>> converter.to_4d(torch.tensor([[0, 0, 0, 1, 1]]), 5, key_value_length=5, dtype=torch.float32)
    tensor([[[[-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
            [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
            [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
            [-3.4028e+38, -3.4028e+38, -3.4028e+38,  0.0000e+00, -3.4028e+38],
            [-3.4028e+38, -3.4028e+38, -3.4028e+38,  0.0000e+00,  0.0000e+00]]]])
    ```

    Parameters:
        is_causal (`bool`):
            Whether the attention mask should be a uni-directional (causal) or bi-directional mask.

        sliding_window (`int`, *optional*):
            Optionally, sliding-window masks can be created if `sliding_window` is set to a positive integer.
    """

    is_causal: bool
    sliding_window: Optional[int]

    def __init__(self, is_causal: bool, sliding_window: Optional[int] = None):
        self.is_causal = is_causal
        self.sliding_window = sliding_window

        if self.sliding_window is not None and self.sliding_window <= 0:
            raise ValueError(
                f"Make sure that when passing `sliding_window` that its value is a strictly positive integer, not `{self.sliding_window}`"
            )

    def to_causal_4d(
        self,
        batch_size: int,
        query_length: int,
        key_value_length: int,
        dtype: torch.dtype,
        device: Union[torch.device, "str"] = "cpu",
    ) -> Optional[torch.Tensor]:
        """
        Creates a causal 4D mask of (bsz, head_dim=1, query_length, key_value_length) shape and adds large negative
        bias to upper right hand triangular matrix (causal mask).
        """
        if not self.is_causal:
            raise ValueError(f"Please use `to_causal_4d` only if {self.__class__} has `is_causal` set to True.")

        input_shape = (batch_size, query_length)
        past_key_values_length = key_value_length - query_length

        # create the causal mask only when it is actually needed:
        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        causal_4d_mask = None
        if input_shape[-1] > 1 or self.sliding_window is not None:
            causal_4d_mask = self._make_causal_mask(
                input_shape,
                dtype,
                device=device,
                past_key_values_length=past_key_values_length,
                sliding_window=self.sliding_window,
            )

        return causal_4d_mask

    def to_4d(
        self,
        attention_mask_2d: torch.Tensor,
        query_length: int,
        dtype: torch.dtype,
        key_value_length: Optional[int] = None,
    ) -> torch.Tensor:
        """
        Converts 2D attention mask to 4D attention mask by expanding mask to (bsz, head_dim=1, query_length,
        key_value_length) shape and by adding a large negative bias to not-attended positions. If attention_mask is
        causal, a causal mask will be added.
        """
        input_shape = (attention_mask_2d.shape[0], query_length)

        # create the causal mask: [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        causal_4d_mask = None
        if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal:
            if key_value_length is None:
                raise ValueError(
                    "This attention mask converter is causal. Make sure to pass `key_value_length` to correctly create a causal mask."
                )

            past_key_values_length = key_value_length - query_length
            causal_4d_mask = self._make_causal_mask(
                input_shape,
                dtype,
                device=attention_mask_2d.device,
                past_key_values_length=past_key_values_length,
                sliding_window=self.sliding_window,
            )
        elif self.sliding_window is not None:
            raise NotImplementedError("Sliding window is currently only implemented for causal masking")

        # broadcast the 2D padding mask to [bsz, 1, tgt_seq_len, src_seq_len] and merge it with the causal mask
        expanded_attn_mask = self._expand_mask(attention_mask_2d, dtype, tgt_len=input_shape[-1]).to(
            attention_mask_2d.device
        )

        if causal_4d_mask is not None:
            expanded_attn_mask = causal_4d_mask.masked_fill(expanded_attn_mask.bool(), torch.finfo(dtype).min)

        # expanded_attn_mask + causal_4d_mask can cause some overflow
        expanded_4d_mask = expanded_attn_mask

        return expanded_4d_mask

    @staticmethod
    def _make_causal_mask(
        input_ids_shape: torch.Size,
        dtype: torch.dtype,
        device: torch.device,
        past_key_values_length: int = 0,
        sliding_window: Optional[int] = None,
    ):
        """
        Make causal mask used for bi-directional self-attention.
        """
        bsz, tgt_len = input_ids_shape
        mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
        mask_cond = torch.arange(mask.size(-1), device=device)
        mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)

        mask = mask.to(dtype)

        if past_key_values_length > 0:
            mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)

        # add lower triangular sliding window mask if necessary
        if sliding_window is not None:
            diagonal = past_key_values_length - sliding_window - 1

            context_mask = torch.tril(torch.ones_like(mask, dtype=torch.bool), diagonal=diagonal)
            mask.masked_fill_(context_mask, torch.finfo(dtype).min)

        return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)

    @staticmethod
    def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
        """
        Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
        """
        bsz, src_len = mask.size()
        tgt_len = tgt_len if tgt_len is not None else src_len

        expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)

        inverted_mask = 1.0 - expanded_mask

        return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)

    @staticmethod
    def _unmask_unattended(
        expanded_mask: torch.FloatTensor,
        min_dtype: float,
    ):
        """
        Attend to all tokens in masked rows from the expanded attention mask, for example the relevant first rows when
        using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
        Details: https://github.com/pytorch/pytorch/issues/110213

        `expanded_mask` is [bsz, num_masks, tgt_seq_len, src_seq_len] or [bsz, tgt_seq_len, src_seq_len].
        `attention_mask` is [bsz, src_seq_len].

        The dimension num_masks of `expanded_mask` is most often 1, but it can also be the number of heads in the case of alibi attention bias.

        For example, if `expanded_mask` is (e.g. here left-padding case)
        ```
        [[[[0, 0, 0],
           [0, 0, 0],
           [0, 0, 1]]],
         [[[1, 0, 0],
           [1, 1, 0],
           [1, 1, 1]]],
         [[[0, 0, 0],
           [0, 1, 0],
           [0, 1, 1]]]]
        ```
        then the modified `expanded_mask` will be
        ```
        [[[[1, 1, 1],   <-- modified
           [1, 1, 1],   <-- modified
           [0, 0, 1]]],
         [[[1, 0, 0],
           [1, 1, 0],
           [1, 1, 1]]],
         [[[1, 1, 1],   <-- modified
           [0, 1, 0],
           [0, 1, 1]]]]
        ```
        """
        if expanded_mask.dtype == torch.bool:
            raise ValueError(
                "AttentionMaskConverter._unmask_unattended expects a float `expanded_mask`, got a BoolTensor."
            )

        return expanded_mask.mul(~torch.all(expanded_mask == min_dtype, dim=-1, keepdim=True))

    @staticmethod
    def _ignore_causal_mask_sdpa(
        attention_mask: Optional[torch.Tensor],
        inputs_embeds: torch.Tensor,
        past_key_values_length: int,
        sliding_window: Optional[int] = None,
        is_training: bool = False,
    ) -> bool:
        """
        Detects whether the optional user-specified attention_mask & the automatically created causal mask can be
        ignored in case PyTorch's SDPA is used, relying instead on SDPA's `is_causal` argument.

        In case no token is masked in the `attention_mask` argument, if `query_length == 1` or
        `key_value_length == query_length`, we instead rely on SDPA's `is_causal` argument to use causal/non-causal
        masks, allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom
        `attn_mask` is passed).
        """
        _, query_length = inputs_embeds.shape[0], inputs_embeds.shape[1]
        key_value_length = query_length + past_key_values_length

        is_tracing = torch.jit.is_tracing() or isinstance(inputs_embeds, torch.fx.Proxy) or is_torchdynamo_compiling()

        ignore_causal_mask = False

        if attention_mask is None:
            # When tracing (TorchScript, FX or torch.compile with fullgraph=True), SDPA's `is_causal` argument would be
            # hard-coded at export time, so the causal mask is only dropped in training or plain eager execution.
            if (
                (is_training or not is_tracing)
                and (query_length == 1 or key_value_length == query_length)
                and (sliding_window is None or key_value_length < sliding_window)
            ):
                ignore_causal_mask = True
        elif sliding_window is None or key_value_length < sliding_window:
            if len(attention_mask.shape) == 4:
                return False
            elif not is_tracing and torch.all(attention_mask == 1):
                if query_length == 1 or key_value_length == query_length:
                    # For query_length == 1, causal attention and bi-directional attention are the same.
                    ignore_causal_mask = True

        return ignore_causal_mask
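

# --- Editor's usage sketch (illustrative only, not part of the Transformers API) ---------------------------------
# The helper below shows how `AttentionMaskConverter` above is typically driven when a model limits attention to a
# sliding window: a causal 4D bias built purely from shapes. The window size and tensor sizes are arbitrary
# assumptions chosen for the example.
def _example_sliding_window_causal_mask():
    """Illustrative only: build a (1, 1, 4, 6) causal mask restricted to a 3-token sliding window."""
    converter = AttentionMaskConverter(is_causal=True, sliding_window=3)
    # 4 new query tokens attending over 6 keys (2 cached + 4 new), in float32 on CPU.
    return converter.to_causal_4d(
        batch_size=1, query_length=4, key_value_length=6, dtype=torch.float32, device="cpu"
    )  # allowed positions hold 0.0, everything outside the causal window holds torch.finfo(torch.float32).min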


def _prepare_4d_causal_attention_mask(
    attention_mask: Optional[torch.Tensor],
    input_shape: Union[torch.Size, Tuple, List],
    inputs_embeds: torch.Tensor,
    past_key_values_length: int,
    sliding_window: Optional[int] = None,
):
    """
    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
    `(batch_size, key_value_length)`

    Args:
        attention_mask (`torch.Tensor` or `None`):
            A 2D attention mask of shape `(batch_size, key_value_length)`
        input_shape (`tuple(int)` or `list(int)` or `torch.Size`):
            The input shape should be a tuple that defines `(batch_size, query_length)`.
        inputs_embeds (`torch.Tensor`):
            The embedded inputs as a torch Tensor.
        past_key_values_length (`int`):
            The length of the key value cache.
        sliding_window (`int`, *optional*):
            If the model uses windowed attention, a sliding window should be passed.
    """
    attn_mask_converter = AttentionMaskConverter(is_causal=True, sliding_window=sliding_window)

    key_value_length = input_shape[-1] + past_key_values_length

    # 2d mask is expanded, a 4d mask is passed through (almost) unchanged
    if attention_mask is not None and len(attention_mask.shape) == 2:
        attention_mask = attn_mask_converter.to_4d(
            attention_mask, input_shape[-1], key_value_length=key_value_length, dtype=inputs_embeds.dtype
        )
    elif attention_mask is not None and len(attention_mask.shape) == 4:
        expected_shape = (input_shape[0], 1, input_shape[1], key_value_length)
        if tuple(attention_mask.shape) != expected_shape:
            raise ValueError(
                f"Incorrect 4D attention_mask shape: {tuple(attention_mask.shape)}; expected: {expected_shape}."
            )
        else:
            # if the 4D mask has the correct shape, invert it and fill it with the most negative dtype value
            inverted_mask = 1.0 - attention_mask
            attention_mask = inverted_mask.masked_fill(
                inverted_mask.to(torch.bool), torch.finfo(inputs_embeds.dtype).min
            )
    else:
        attention_mask = attn_mask_converter.to_causal_4d(
            input_shape[0], input_shape[-1], key_value_length, dtype=inputs_embeds.dtype, device=inputs_embeds.device
        )

    return attention_mask
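

# --- Editor's usage sketch (illustrative only, not part of the Transformers API) ---------------------------------
# Typical decoder call pattern for the helper above: a right-padded 2D mask plus the embedded inputs are turned
# into the additive 4D causal mask consumed by the attention layers. Batch, sequence and hidden sizes are
# arbitrary assumptions.
def _example_prepare_4d_causal_attention_mask():
    """Illustrative only: expand a (2, 5) padding mask into a (2, 1, 5, 5) additive causal mask."""
    attention_mask = torch.tensor([[1, 1, 1, 1, 0], [1, 1, 1, 1, 1]])  # second sequence is unpadded
    inputs_embeds = torch.zeros(2, 5, 8)  # stand-in for real (batch, seq_len, hidden) embeddings
    return _prepare_4d_causal_attention_mask(
        attention_mask, input_shape=(2, 5), inputs_embeds=inputs_embeds, past_key_values_length=0
    )  # 0.0 where attention is allowed, a large negative bias on future and padded positions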


def _prepare_4d_causal_attention_mask_for_sdpa(
    attention_mask: Optional[torch.Tensor],
    input_shape: Union[torch.Size, Tuple, List],
    inputs_embeds: torch.Tensor,
    past_key_values_length: int,
    sliding_window: Optional[int] = None,
):
    """
    Prepares the correct `attn_mask` argument to be used by `torch.nn.functional.scaled_dot_product_attention`.

    In case no token is masked in the `attention_mask` argument, we simply set it to `None` for the cases `query_length == 1` and
    `key_value_length == query_length`, and rely instead on SDPA's `is_causal` argument to use causal/non-causal masks,
    allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom `attn_mask` is passed).
    """
    attn_mask_converter = AttentionMaskConverter(is_causal=True, sliding_window=sliding_window)

    key_value_length = input_shape[-1] + past_key_values_length

    # torch.jit.trace, symbolic_trace and torchdynamo with fullgraph=True cannot capture the data-dependent control
    # flow `is_causal=attention_mask is None and q_len > 1`, so SDPA's `attn_mask` argument is always used when tracing.
    is_tracing = torch.jit.is_tracing() or isinstance(inputs_embeds, torch.fx.Proxy) or is_torchdynamo_compiling()

    ignore_causal_mask = AttentionMaskConverter._ignore_causal_mask_sdpa(
        attention_mask=attention_mask,
        inputs_embeds=inputs_embeds,
        past_key_values_length=past_key_values_length,
        sliding_window=sliding_window,
    )

    if ignore_causal_mask:
        expanded_4d_mask = None
    elif attention_mask is None:
        expanded_4d_mask = attn_mask_converter.to_causal_4d(
            input_shape[0], input_shape[-1], key_value_length, dtype=inputs_embeds.dtype, device=inputs_embeds.device
        )
    else:
        if attention_mask.dim() == 4:
            expanded_4d_mask = attention_mask
        else:
            expanded_4d_mask = attn_mask_converter.to_4d(
                attention_mask,
                input_shape[-1],
                dtype=inputs_embeds.dtype,
                key_value_length=key_value_length,
            )

        # Attend to all tokens in fully masked rows, for example the relevant first rows when using left padding.
        # This is required by F.scaled_dot_product_attention's memory-efficient attention path.
        # Details: https://github.com/pytorch/pytorch/issues/110213
        if not is_tracing and expanded_4d_mask.device.type == "cuda":
            expanded_4d_mask = AttentionMaskConverter._unmask_unattended(
                expanded_4d_mask, min_dtype=torch.finfo(inputs_embeds.dtype).min
            )

    return expanded_4d_mask
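

# --- Editor's usage sketch (illustrative only, not part of the Transformers API) ---------------------------------
# With an all-ones mask in eager mode and matching query/key lengths, the SDPA variant above is expected to
# return `None`, so the model can pass `is_causal=True` to scaled_dot_product_attention and hit the fused
# kernels. The sizes are arbitrary assumptions.
def _example_sdpa_mask_can_be_dropped():
    """Illustrative only: an all-ones 2D mask should collapse to `None` for SDPA."""
    attention_mask = torch.ones(1, 5, dtype=torch.long)
    inputs_embeds = torch.zeros(1, 5, 8)
    return _prepare_4d_causal_attention_mask_for_sdpa(
        attention_mask, input_shape=(1, 5), inputs_embeds=inputs_embeds, past_key_values_length=0
    )  # expected to be None here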


def _prepare_4d_attention_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
    """
    Creates a non-causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
    `(batch_size, key_value_length)`

    Args:
        mask (`torch.Tensor`):
            A 2D attention mask of shape `(batch_size, key_value_length)`
        dtype (`torch.dtype`):
            The torch dtype the created mask shall have.
        tgt_len (`int`):
            The target length or query length the created mask shall have.
    """
    return AttentionMaskConverter._expand_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)


def _prepare_4d_attention_mask_for_sdpa(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
    """
    Creates a non-causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
    `(batch_size, key_value_length)`

    Args:
        mask (`torch.Tensor`):
            A 2D attention mask of shape `(batch_size, key_value_length)`
        dtype (`torch.dtype`):
            The torch dtype the created mask shall have.
        tgt_len (`int`):
            The target length or query length the created mask shall have.
    """
    _, key_value_length = mask.shape
    tgt_len = tgt_len if tgt_len is not None else key_value_length

    is_tracing = torch.jit.is_tracing() or isinstance(mask, torch.fx.Proxy) or is_torchdynamo_compiling()

    # torch.jit.trace, symbolic_trace and torchdynamo with fullgraph=True are unable to capture the data-dependent
    # control flow below, so the mask is only dropped in eager mode.
    if not is_tracing and torch.all(mask == 1):
        return None
    else:
        return AttentionMaskConverter._expand_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)
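

# --- Editor's usage sketch (illustrative only, not part of the Transformers API) ---------------------------------
# The two non-causal helpers above are what encoder and cross-attention paths use: they only broadcast the
# key-padding mask into an additive bias, without any triangular (causal) component. Query length 3 and key
# length 5 below are arbitrary assumptions.
def _example_prepare_4d_attention_mask():
    """Illustrative only: broadcast a (1, 5) key-padding mask to a (1, 1, 3, 5) additive bias."""
    key_padding_mask = torch.tensor([[1, 1, 1, 0, 0]])
    return _prepare_4d_attention_mask(key_padding_mask, dtype=torch.float32, tgt_len=3)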


def _create_4d_causal_attention_mask(
    input_shape: Union[torch.Size, Tuple, List],
    dtype: torch.dtype,
    device: torch.device,
    past_key_values_length: int = 0,
    sliding_window: Optional[int] = None,
) -> Optional[torch.Tensor]:
    """
    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)`

    Args:
        input_shape (`tuple(int)` or `list(int)` or `torch.Size`):
            The input shape should be a tuple that defines `(batch_size, query_length)`.
        dtype (`torch.dtype`):
            The torch dtype the created mask shall have.
        device (`torch.device`):
            The torch device the created mask shall have.
        sliding_window (`int`, *optional*):
            If the model uses windowed attention, a sliding window should be passed.
    """
    attn_mask_converter = AttentionMaskConverter(is_causal=True, sliding_window=sliding_window)

    key_value_length = past_key_values_length + input_shape[-1]
    attention_mask = attn_mask_converter.to_causal_4d(
        input_shape[0], input_shape[-1], key_value_length, dtype=dtype, device=device
    )

    return attention_mask
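

# --- Editor's smoke test (illustrative only, not part of the Transformers API) -----------------------------------
# Running `python -m transformers.modeling_attn_mask_utils` sanity-checks the pure-shape entry point above;
# the sizes are arbitrary assumptions.
if __name__ == "__main__":
    demo_mask = _create_4d_causal_attention_mask(
        input_shape=(2, 4), dtype=torch.float32, device=torch.device("cpu"), past_key_values_length=3
    )
    print("causal 4D mask shape:", None if demo_mask is None else tuple(demo_mask.shape))  # -> (2, 1, 4, 7)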