import inspect
import os
from typing import Optional, Tuple

import torch
import torch.nn.functional as F

from .utils import is_flash_attn_2_available, is_flash_attn_greater_or_equal


if is_flash_attn_2_available():
    from flash_attn import flash_attn_func, flash_attn_varlen_func
    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input

    _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)


def _get_unpad_data(attention_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, int]:
    """
    Retrieves indexing data required to repad unpadded (ragged) tensors.

    Arguments:
        attention_mask (`torch.Tensor`):
            Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.

    Return:
        indices (`torch.Tensor`):
            The indices of non-masked tokens from the flattened input sequence.
        cu_seqlens (`torch.Tensor`):
            The cumulative sequence lengths, used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
        max_seqlen_in_batch (`int`):
            Maximum sequence length in batch.
    """
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return (
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )


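# Illustrative example for `_get_unpad_data` (the 2x3 mask below is hypothetical and not part of the module):
#
#     mask = torch.tensor([[1, 1, 0], [1, 1, 1]])
#     indices, cu_seqlens, max_seqlen = _get_unpad_data(mask)
#     # indices    -> tensor([0, 1, 3, 4, 5])  (flattened positions of the 5 real tokens)
#     # cu_seqlens -> tensor([0, 2, 5], dtype=torch.int32)
#     # max_seqlen -> 3

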
def _upad_input(
    query_layer: torch.Tensor,
    key_layer: torch.Tensor,
    value_layer: torch.Tensor,
    attention_mask: torch.Tensor,
    query_length: int,
):
    """
    Unpads query, key, and values tensors, using a single dimension for all tokens even though they belong to different batches.

    This function is used instead of `flash_attn.bert_padding.unpad_input` in order to avoid the recomputation of the same intermediary
    tensors for query, key, value tensors.

    Arguments:
        query_layer (`torch.Tensor`):
            Query state with padding. Shape: (batch_size, query_length, num_heads, head_dim).
        key_layer (`torch.Tensor`):
            Key state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
        value_layer (`torch.Tensor`):
            Value state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
        attention_mask (`torch.Tensor`):
            Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.
        query_length (`int`):
            Target length.

    Return:
        query_layer (`torch.Tensor`):
            Query state without padding. Shape: (total_target_length, num_heads, head_dim).
        key_layer (`torch.Tensor`):
            Key state without padding. Shape: (total_source_length, num_key_value_heads, head_dim).
        value_layer (`torch.Tensor`):
            Value state without padding. Shape: (total_source_length, num_key_value_heads, head_dim).
        indices_q (`torch.Tensor`):
            The indices of non-masked tokens from the flattened input target sequence.
        (cu_seqlens_q, cu_seqlens_k) (`Tuple[int]`):
            The cumulative sequence lengths for the target (query) and source (key, value), used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
        (max_seqlen_in_batch_q, max_seqlen_in_batch_k) (`Tuple[int]`):
            Maximum sequence length in batch (`max_seqlen_in_batch_q` for the target sequence i.e. query, `max_seqlen_in_batch_k` for the source sequence i.e. key/value).
    """
    indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
    batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

    key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k)
    value_layer = index_first_axis(
        value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
    )
    if query_length == kv_seq_len:
        query_layer = index_first_axis(query_layer.reshape(batch_size * kv_seq_len, -1, head_dim), indices_k)
        cu_seqlens_q = cu_seqlens_k
        max_seqlen_in_batch_q = max_seqlen_in_batch_k
        indices_q = indices_k
    elif query_length == 1:
        max_seqlen_in_batch_q = 1
        cu_seqlens_q = torch.arange(
            batch_size + 1, dtype=torch.int32, device=query_layer.device
        )  # There is a memcpy here, that is very bad.
        indices_q = cu_seqlens_q[:-1]
        query_layer = query_layer.squeeze(1)
    else:
        # The -query_length: slice assumes left padding.
        attention_mask = attention_mask[:, -query_length:]
        query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)

    return (
        query_layer,
        key_layer,
        value_layer,
        indices_q,
        (cu_seqlens_q, cu_seqlens_k),
        (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
    )


def prepare_fa2_from_position_ids(query, key, value, position_ids):
    """
    This function returns necessary arguments to call `flash_attn_varlen_func`.
    All three query, key, value states will be flattened.
    Cumulative lengths of each example in the batch will be extracted from position_ids.

    NOTE: ideally, cumulative lengths should be prepared at the data collator stage.

    Arguments:
        query (`torch.Tensor`):
            Query state with padding. Shape: (batch_size, query_length, num_heads, head_dim).
        key (`torch.Tensor`):
            Key state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
        value (`torch.Tensor`):
            Value state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
        position_ids (`torch.Tensor`):
            Int tensor of shape (batch_size, sequence_length), containing the position index of each token; the indices restart from 0 at the beginning of every packed sequence.

    Return:
        query (`torch.Tensor`):
            Query state without padding. Shape: (total_target_length, num_heads, head_dim).
        key (`torch.Tensor`):
            Key state without padding. Shape: (total_source_length, num_key_value_heads, head_dim).
        value (`torch.Tensor`):
            Value state without padding. Shape: (total_source_length, num_key_value_heads, head_dim).
        indices_q (`torch.Tensor`):
            The indices of non-masked tokens from the flattened input target sequence.
        (cu_seqlens_q, cu_seqlens_k) (`Tuple[int]`):
            The cumulative sequence lengths for the target (query) and source (key, value), used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
        (max_seqlen_in_batch_q, max_seqlen_in_batch_k) (`Tuple[int]`):
            Maximum sequence length in batch (`max_seqlen_in_batch_q` for the target sequence i.e. query, `max_seqlen_in_batch_k` for the source sequence i.e. key/value).
    """
    query = query.view(-1, query.size(-2), query.size(-1))
    key = key.view(-1, key.size(-2), key.size(-1))
    value = value.view(-1, value.size(-2), value.size(-1))
    position_ids = position_ids.flatten()
    indices_q = torch.arange(position_ids.size(0), device=position_ids.device, dtype=torch.int32)

    cu_seq_lens = torch.cat(
        (
            indices_q[position_ids == 0],
            torch.tensor(position_ids.size(), device=position_ids.device, dtype=torch.int32),
        )
    )

    max_length = position_ids.max() + 1

    return (query, key, value, indices_q, (cu_seq_lens, cu_seq_lens), (max_length, max_length))


def _flash_attention_forward(
    query_states: torch.Tensor,
    key_states: torch.Tensor,
    value_states: torch.Tensor,
    attention_mask: torch.Tensor,
    query_length: int,
    is_causal: bool,
    dropout: float = 0.0,
    position_ids: Optional[torch.Tensor] = None,
    softmax_scale: Optional[float] = None,
    sliding_window: Optional[int] = None,
    use_top_left_mask: bool = False,
    softcap: Optional[float] = None,
    deterministic: Optional[bool] = None,
):
    """
    Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
    first unpads the input, then computes the attention scores and pads the final attention scores.

    Args:
        query_states (`torch.Tensor`):
            Input query states to be passed to Flash Attention API
        key_states (`torch.Tensor`):
            Input key states to be passed to Flash Attention API
        value_states (`torch.Tensor`):
            Input value states to be passed to Flash Attention API
        attention_mask (`torch.Tensor`):
            The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
            position of padding tokens and 1 for the position of non-padding tokens.
        dropout (`float`):
            Attention dropout
        softmax_scale (`float`, *optional*):
            The scaling of QK^T before applying softmax. Defaults to 1 / sqrt(head_dim).
        use_top_left_mask (`bool`, defaults to `False`):
            flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is a bottom-right alignment, which became the default in flash_attn>=2.1. This attribute is used to handle this difference.
        softcap (`float`, *optional*):
            Softcap for the attention logits, used e.g. in gemma2.
        deterministic (`bool`, *optional*):
            Determines if the deterministic option introduced in flash_attn>=2.4.1 is enabled.
    """
    if not use_top_left_mask:
        causal = is_causal
    else:
        # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1.
        causal = is_causal and query_length != 1

    # Assuming 4D tensors, key_states.shape[1] is the key/value sequence length (source length).
    use_sliding_windows = (
        _flash_supports_window_size and sliding_window is not None and key_states.shape[1] > sliding_window
    )
    flash_kwargs = {"window_size": (sliding_window, sliding_window)} if use_sliding_windows else {}

    if is_flash_attn_greater_or_equal("2.4.1"):
        if deterministic is None:
            deterministic = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1"
        flash_kwargs["deterministic"] = deterministic

    if softcap is not None:
        flash_kwargs["softcap"] = softcap

    # Contains at least one padding token in the sequence: unpad, run varlen attention, then repad.
    if attention_mask is not None:
        batch_size = query_states.shape[0]
        query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = _upad_input(
            query_states, key_states, value_states, attention_mask, query_length
        )
        cu_seqlens_q, cu_seqlens_k = cu_seq_lens
        max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

        attn_output_unpad = flash_attn_varlen_func(
            query_states,
            key_states,
            value_states,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_k,
            max_seqlen_q=max_seqlen_in_batch_q,
            max_seqlen_k=max_seqlen_in_batch_k,
            dropout_p=dropout,
            softmax_scale=softmax_scale,
            causal=causal,
            **flash_kwargs,
        )
        attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)

    # If position_ids is provided and the positions are not monotonically increasing, the batch is packed:
    # use `flash_attn_varlen_func` to prevent cross-example attention. Additionally check that we are in the
    # pre-fill/training stage (query_length != 1).
    elif position_ids is not None and query_length != 1 and not (torch.diff(position_ids, dim=-1) >= 0).all():
        batch_size = query_states.size(0)
        query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = prepare_fa2_from_position_ids(
            query_states, key_states, value_states, position_ids
        )
        cu_seqlens_q, cu_seqlens_k = cu_seq_lens
        max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

        attn_output = flash_attn_varlen_func(
            query_states,
            key_states,
            value_states,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_k,
            max_seqlen_q=max_seqlen_in_batch_q,
            max_seqlen_k=max_seqlen_in_batch_k,
            dropout_p=dropout,
            softmax_scale=softmax_scale,
            causal=causal,
            **flash_kwargs,
        )
        attn_output = attn_output.view(batch_size, -1, attn_output.size(-2), attn_output.size(-1))

    # No padding and no packing: call the dense kernel directly.
    else:
        attn_output = flash_attn_func(
            query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal, **flash_kwargs
        )

    return attn_output
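

# Minimal usage sketch for `_flash_attention_forward` (hypothetical shapes; assumes a CUDA device with
# flash-attn installed and fp16/bf16 inputs; not part of the module itself):
#
#     q = torch.randn(2, 128, 8, 64, dtype=torch.float16, device="cuda")
#     k = torch.randn(2, 128, 8, 64, dtype=torch.float16, device="cuda")
#     v = torch.randn(2, 128, 8, 64, dtype=torch.float16, device="cuda")
#     mask = torch.ones(2, 128, dtype=torch.long, device="cuda")
#     mask[0, 100:] = 0                            # pad the tail of the first sequence
#     out = _flash_attention_forward(q, k, v, mask, query_length=128, is_causal=True, dropout=0.0)
#     # out -> shape (2, 128, 8, 64), with the padded positions zero-filled by `pad_input`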