
    Χg~                     P   d dl Z d dlmZmZ d dlZd dlZd dlmc mZ	 d dl
mZmZmZmZmZmZ d dlmZ ddlmZ  e j        e          Z	 	 	 	 d-d	ej        d
ej        dej        deej                 fdZd.dedefdZd.dedefdZ	 d.dej        dedefdZd.defdZd.dedefdZ d.dedefdZ!d.dedefdZ"d.dedefdZ#d Z$dej        deej        e%e%f         fdZ&dej        fdZ'dej        de%d e%d!e%dej        f
d"Z(d# Z)dej        d$e%d%edej        fd&Z*d' Z+d(ej        fd)Z,d* Z-d	ej        d
ej        dej        deej                 deej        ej        ej        eej                 f         f
d+Z.	 	 	 	 	 d/d	ej        d
ej        dej        deej                 fd,Z/dS )0    N)OptionalTuple)can_use_efficient_attentioncan_use_flash_attentionflash_sdp_enabledmath_sdp_enabledmem_efficient_sdp_enabled
SDPAParams)
SDPBackend   )NestedTensor        Fquerykeyvalue	attn_maskc           	         t          | t                    r*t          |t                    rt          |t                    s(t          d| j         d|j         d|j         d          | j        |j        k    s| j        |j        k    r(t          d| j         d|j         d|j         d          | j        |j        k    s| j        |j        k    r(t          d| j         d	|j         d
|j         d          |                                 dk     s0|                                dk     s|                                dk     rOt          d|                                  d|                                 d|                                 d          | j        |j        k    s| j        |j        k    r(t          d| j         d|j         d|j         d          |t          d          d S )NzNExpected query, key, and value to be nested tensors, but got query.is_nested: z, key.is_nested: z, and value.is_nested: z	 instead.zLExpected query, key, and value to have the same dtype, but got query.dtype: z, key.dtype: z, and value.dtype: zSExpected query, key, and value to have the same device type, but got query.device: z, key.device: z, and value.device:    zUExpected query, key, and value to all be  at least 3 dimensional, but got query.dim: z, key.dim: z and value.dim: z[Expected query, key, and value to all be ragged on the same dimension, but got ragged dims z, z, and z, respectively.zMasks are not yet supported!)

isinstancer   
ValueError	is_nesteddtypedevicedim_ragged_idxtorchbool)r   r   r   r   	dropout_p	is_causalscales          W/var/www/html/ai-engine/env/lib/python3.11/site-packages/torch/nested/_internal/sdpa.py_validate_sdpa_inputr"      s    ul++	
#|,,	
 %..	

 ?(-? ?JM-? ?$)O? ? ?
 
 	

 {ci5;%+#=#=7$)K7 7>Ai7 7 %7 7 7
 
 	

 |sz!!U\U\%A%A9%*\9 9AD9 9!&9 9 9
 
 	

 yy{{Q#''))a--599;;??Yyy{{Y Y'*wwyyY YBG))++Y Y Y
 
 	
 CO++u/@EDU/U/Uc%c c),c c@E@Qc c c
 
 	
 7888     paramsreturnc                     | j                             d          }| j                            d          }| j                            d          }||k    o||k    S )Nr   )r   sizer   r   )r$   debugq_batch_sizek_batch_sizev_batch_sizes        r!   _check_batch_size_nestedr,   K   sX     <$$Q''L:??1%%L<$$Q''L
 <'HLL,HHr#   c                 "   d}| j                             d          }| j                            d          }| j                            d          }||k    o||k    }|r|dz  dk    r||k    s!|rt                              d|||           dS dS )N      r   zFor NestedTensor inputs, Flash attention requires q,k,v to have the same last dimension and to be a multiple of 8 and less than or equal to 256. Got Query.size(-1): %d, Key.size(-1): %d, Value.size(-1): %d instead.FT)r   r'   r   r   logwarning)r$   r(   max_sizequery_size_lastkey_size_lastvalue_size_lastsame_head_dim_sizes          r!   !_check_head_dim_size_flash_nestedr8   X   s    Hl''++OJOOB''Ml''++O=(O_-O  	q A%%(( 	KKX     u4r#   param
param_namec                    t          | t                    s
J d            | j        dk    r|rt                              d|           dS |                                 dk    r|rt                              d|           dS dS )Nzparam should be a jagged NTr   zMFused kernels do not support ragged num_head_dims, %s has a ragged num_heads.Fr   zAFused kernels do not support seq_len == 0, %s has a seq len of 0.T)r   r   r   r1   r2   _get_min_seqlen)r9   r:   r(   s      r!   :_check_for_seq_len_0_and_consistent_head_dim_nested_helperr=   r   s     e\**II,IIIIA 	KK_   u !## 	KKS   u4r#   c           
          t          | ||          }| |k    r| dk    s||k    r|dk    s||k    r+|dk    r%|r!t                              d||| ||||           dS dS )Nr   zzBoth fused kernels require query, key and value to have broadcastable %s, got Query %s %d, Key %s %d, Value %s %d instead.FT)maxr1   r2   )q_sizek_sizev_sizer:   r(   r3   s         r!   _try_broadcast_param_sizerC      s    666**H	8		!h6Q;;h6Q;; 	KKC
 
 
 u4r#   c                 X   | j         j        rt          | j         d|          nd}|sdS | j        j        rt          | j        d|          nd}|sdS | j        j        rt          | j        d|          nd}|sdS | j                             d          }| j                            d          }| j                            d          }||k    o||k    }|sU| j         j        s| j        j        s| j        j        r|rt                              d           dS t          |||d|          S dS )	Nr   TFr   r   r   zFBoth fused kernels do not support training with broadcasted NT inputs.z	num heads)
r   r   r=   r   r   r'   requires_gradr1   r2   rC   )	r$   r(   	q_is_safe	k_is_safe	v_is_safeq_num_headsk_num_headsv_num_headssame_num_headss	            r!   _check_for_seq_len_0_nestedrM      s    <!	BL'5	
 	
 	
    u :	BJu	
 	
 	
    u <!	BL'5	
 	
 	
    u ,##A&&K*//!$$K,##A&&K K/NK;4NN 
L&		z'		 |)		
  \   5(k;
 
 	
 4r#   c                 V    t           t          t          f}|D ]} || |          s dS dS NFT)r,   r8   rM   r$   r(   constraints
constraints       r!   _can_use_flash_sdpa_jaggedrS      sH     )#K
 "  
z&%(( 	55	4r#   c                 J    t           t          f}|D ]} || |          s dS dS rO   )r,   rM   rP   s       r!   _can_use_efficient_sdpa_jaggedrU      sE     #K "  
z&%(( 	55	4r#   c                    | j                             dd                                          rZ| j                            dd                                          r-| j                            dd                                          s|rt
                              d           dS | j        r|rt
                              d           dS dS )Nr      zGIf inputs are nested tensors they must be contiguous after transposing.FzENested tensors for query / key are not supported when is_causal=True.T)r   	transposeis_contiguousr   r   r1   r2   r   )r$   r(   s     r!   _can_use_math_sdpa_jaggedrZ      s    L""1a((6688	z##Aq))7799	 |%%a++99;;	
  	KKY   u  	KKW   u4r#   c           	         t                      s(t                      st                      st          j        S t          j        t          j        t          j        f}t          | ||||||          }|D ]}	|	t          j        k    r,t          |          rt          |          rt          j        c S |	t          j        k    r,t          |          rt          |          rt          j        c S |	t          j        k    r+t                      rt          |          rt          j        c S t                              d           t          |d           t          |d           t                              d           t          |d           t          |d           t                              d           t          |d           t          j        S )Nz)Memory efficient kernel not used because:T)r(   z(Flash attention kernel not used because:z'Math attention kernel not used because:)r   r	   r   r   ERRORFLASH_ATTENTIONEFFICIENT_ATTENTIONMATHr
   r   rS   r   rU   rZ   r1   r2   )
r   r   r   r   dropoutr   
enable_gqaorderingr$   backends
             r!   _select_sdp_backendrd   	  s    )++  !"" 
  	"&H sE9gy*UUF ' 'j000&v.. 23Mf3U3U 2!1111j444*622 67U8 8 6 "5555jo%%!! '&?&G&G '!&&&KK;<<<d3333"66666KK:;;;F$////vT2222KK9:::fD1111r#   qkvc                    t          | t                    st          d          |                                 l|                                                     t          j        | j                  }| 	                                }| 
                                j        d         }n|                                                     d                              t          j        | j                  }|                     d          }| 	                                }t          |d                                                   }|||fS )Nz<QKV must be nested for flash cumulative_seq_len calculation.)r   r   r   r/   )r   r   r   lengthsoffsetstor   int32r   _get_max_seqlenvaluesshapecumsumr'   intitem)re   cumulative_seqlen
max_seqlenn_elem
batch_sizes        r!   _cumulative_and_max_seq_len_nnzru   1  s    c<(( YWXXX
{{}}KKMM,,5;sz,RR((**
#A& KKMM  ##&&U[&LL 	 XXa[[
((**
&r*//1122j&00r#   tensorc                     t          | t                    sJ |                                 }| j        }|                    d          dz
  }|dk    rdS |d         }|dd          D ]}||k    r dS |}dS )Nr   r   TrW   F)r   r   rh   _stridesr'   )rv   rh   strides	n_tensorsprev_stridestrides         r!   !_is_safe_to_get_storage_as_tensorr}   M  s     fl+++++nnGoGQ!#IA~~t !*K!""+  &   55 4r#   Nnz	num_headshead_dimc                 f    | j         r|                                 S |                     |||          S N)r   rl   view)rv   r~   r   r   s       r!   _view_as_denser   l  s3      }};;sIx000r#   c                    |                      d          }|                     d          }|                     d          }|                      d          }|                     d          }|                     d          }||k    r||k    r||k    r||k    st          d          |                      d          }	|                      d          }
|                     d          }|                     dd          }|                    dd          }|                    dd          }t          |          \  }}}t          |          \  }}}|                                s#t          |          s|                                }|                                s#t          |          s|                                }|                                s#t          |          s|                                }t          |||	|
          }t          |||	|
          }t          |||	|          }|                                |	                                |
                                d}||||||||fS )Nr   r   z<This path is currently not implemented for jagged layout NT.r   rW   )rh   _max_seqlen_min_seqlen)r'   RuntimeErrorrX   ru   rY   r}   
contiguousr   rh   rk   r<   )r   r   r   r)   r*   r+   rI   rJ   rK   r   head_dim_qk
head_dim_vq_tk_tv_tcumulative_sequence_length_qmax_seqlen_batch_qNnz_qcumulative_sequence_length_kvmax_seqlen_batch_kvNnz_kvquery_buffer_reshapedkey_buffer_reshapedvalue_buffer_reshapedoutput_nt_infos                            r!   _sdpa_nested_preprocessingr     sm    ::a==L88A;;L::a==L**Q--K((1++K**Q--KL((\\-I-I{""{k'A'AJ
 
 	

 

1I**Q--KAJ
//!Q

C
--1

C
//!Q

C 	(,,	$ 	(,,	%  'H'M'M nn 'H'M'M nn 'H'M'M nn*3y+NN(fiMM*3	:NN ;;==**,,**,, N 	$%	 	r#   alignment_sizeslicec                     |                      d          }||z  dk    r| S |||z  z
  }t          j        j                            | d|g          } |r| dd|f         S | S )Nr/   r   .)r'   r   nn
functionalpad)rv   r   r   last_dim_size	pad_counts        r!   _pad_last_dimr   K  sv     KKOOM~%**-."@AIX $$Va^<<F ,c1]?*++Mr#   c                 b    ||n)t          j        d|                     d          z            }|S )Ng      ?r/   )r   sym_sqrtr'   )r   r    softmax_scales      r!   _calculate_scaler   ^  s0    ".EEEN3TVCW4X4XMr#   outc                 ^    | j         s%|                     d          |k    r| dd|f         } | S )Nr/   .r   )r   r'   )r   og_sizes     r!   _post_process_flash_outputr   d  s7    = "SXXb\\W44#qy.!Jr#   c                     t           j                                        sL| j        j        dk    r<t           j        j                                        }t          d |D                       S dS )Nmetac              3   d   K   | ]+}t          |          t          j        j        j        k    V  ,d S r   )typer   utilsflop_counterFlopCounterMode).0xs     r!   	<genexpr>z+_is_computing_meta_flops.<locals>.<genexpr>q  sI       
 
 GGu{/??
 
 
 
 
 
r#   F)	r   jitis_scriptingr   r   r   _python_dispatch _get_current_dispatch_mode_stackany)r   torch_dispatch_mode_stacks     r!   _is_computing_meta_flopsr   j  sx     9!!## 
(?(?K(IIKK 	"  
 
.
 
 
 
 
 	
 5r#   c                     | j         j        t          |           st          j                  s| |||fS fd} ||            ||           ||           ||          fS )a*  
    [Autocasting SDPA for NJT]

    Normal autocasting doesn't work for NJT+SDPA right now:
    * NJT intercepts the __torch_function__ call for scaled_dot_product_attention, which happens
      before we get to any aten ops or dispatcher logic; then the torch_function logic calls into
      efficient attention or flash attention. So, autocasting on the scaled_dot_product_attention
      op won't work because we never see that aten op.
    * If we put autocasting on `_flash_attention_forward`, then we'll get autocasting to run, but
      the kernel selection logic in torch_function handling (ie. jagged_scaled_dot_product_attention)
      won't work correctly: the kernel selection logic will run before autocasting, and choose
      a kernel based on the un-autocasted dtypes; but then autocasting will run and the actual
      attention computation will happen in a different dtype.

    An alternative is to just change the backend selection logic for SDPA+NJT to be autocast-aware
    and rely on autocasting to do the actual conversions for flash attention / efficient attention.
    However, by manually doing the actual autocast before the backend selection, we ensure that the
    autocast handling for backend selection doesn't diverge from the autocast handling for the
    actual dtype conversions.
    c                     | | S t          j                  }| j        j        r | j        |k    s| j        t           j        k    r| S |                     |          S r   )r   get_autocast_dtyper   is_floating_pointfloat64ri   )r   target_dtypedevice_types     r!   cvtz_autocast.<locals>.cvt  s]    9H/<<*	w,&&w%-''HttL!!!r#   )r   r   r   r   is_autocast_enabled)r   r   r   r   r   r   s        @r!   	_autocastr   x  s    4 ,#K&& ,e.G.T.T ,c5)++
" 
" 
" 
" 
" 3u::ss3xxUSS^^;;r#   c                 	   t          | |||          \  } }}}t          | ||||||           t          | t                    r*t          |t                    rt          |t                    sJ ddlm} |                                 dk    r|                                dk    r|                                dk    r| j        dk    rt          j	        | 
                                |
                                |
                                t          |t                    r|
                                n||||          }	 ||	|                                           S | j        p|j        p|j        }
t          | ||||||          }t          |           rt          j        }|t          j        k    r|                     d          }t%          | dd          }t%          |dd          }t%          |dd          }t'          | |          }t)          |||          \  }}}}}}}}t*          j        j                            |||||||||d|	          \  }}}}} |||d
         |d         |d                                       dd          }t5          ||          S |t          j        k    rt)          | ||          \  }}} }}}}}t*          j        j                            |                    d          |                    d          |                     d          d |||||t=          |          |
|	          \  }}!}"}#}$} ||                    d          |d
         |d         |d                                       dd          S |t          j         k    r?|                                 }%| j!        d         }&|j!        d         }'| j"        #                    dd           }(| j"        #                    dd           })d }* |*|           }  |*|          } |*|          }t+          j$        | ||||||	          d         }+ddlm%}, |+                    dd          &                                
                                }+|+'                    d|&|'          }+ ||+|%|(d n
 |,|(          |)d n
 |,|)                                        dd          }+|+S tQ          d          )Nr   )nested_view_from_values_offsetsr   r   )r   r   r   r    r/   r0   F)r    rh   r   r   )
min_seqlenrr   rW   r   rr   c                    | j         dd          | j         d d         z
  }t          j        | dd          }|                                                    t          |          d          }t          j                            t          |                    }|                    dd                                          }|S )Nr   r/   rW   r   )r   )	_offsetsr   rX   rl   splitlistnestedas_nested_tensorr   )jagged_layout_ntrg   rX   tensor_list
strided_nts        r!    get_strided_layout_nested_tensorzMjagged_scaled_dot_product_attention.<locals>.get_strided_layout_nested_tensor?  s    &/36F6OPSQSPS6TTG(8!Q??I#**,,224==a2HHK66tK7H7HIIJ#--a33>>@@Jr#   )_load_val_from_tensorz=No viable backend for scaled_dot_product_attention was found.))r   r"   r   r   $torch.nested._internal.nested_tensorr   r   r   Fscaled_dot_product_attentionrl   rh   rE   rd   r   r   r]   r'   r   r   r   r   opsaten_flash_attention_forwardrX   r   r^   _efficient_attention_forward	unsqueezero   squeezer_   _size_metadata_cacheget"_scaled_dot_product_attention_mathr   r   r   r   )-r   r   r   r   r   r   r    ra   r   outputcompute_logsumexpbackend_choicer   query_padded
key_paddedvalue_paddedog_scaler   r   r   r   r   r   r   r   	attention	logsumexpphilox_seedphilox_offsetdebug_attn_maskquery_reshapedkey_reshapedvalue_reshaped
log_sumexpseedoffsetmax_seqlen_qrh   d1d2min_seqlen_tensormax_seqlen_tensorr   attn_outr   s-                                                r!   #jagged_scaled_dot_product_attentionr     s    $-UC	#J#J E3yUIy)USSS 	5,''sL)) ul++  
 UTTTTT
 yy{{Q37799q==UYY[[1__ARVWAWAW/LLNNJJLLLLNN&0L&I&IX	  """y

 

 

 /.vu}}GGG+Ws/@WEDW(sE9iJ N  && 4
 $3333**R..$UAu55"3511
$UAu55#E511 '|ZNN		
!!() IN33!!() 4 
 
	
  439%%m4%m4	
 
 

 )Aq// 	 *)W===	:9	9	9 'uc599		
() IN77$$Q''""1%%$$Q''()	NN 8 
 
	
" /.a  9%%m4%m4	
 
 

 )Aq//	 
:?	*	* --//[^[_!155$
 
 "155$
 
	 	 	 1077..s330077;3y)Ye
 
 

 	ONNNNN %%a++6688??AA==R,,22 %, **+<== %, **+<==
 
 
 )Aq// 	 K
 
 	
r#   )Nr   FN)F)Nr   FNF)0loggingtypingr   r   r   torch.nntorch.nn.functionalr   r   r   torch.backends.cudar   r   r   r   r	   r
   torch.nn.attentionr   nested_tensorr   	getLogger__name__r1   Tensorr"   r   r,   r8   strr=   rC   rM   rS   rU   rZ   rd   ro   ru   r}   r   r   r   r   r   r   r   r    r#   r!   <module>r     s    " " " " " " " "                           * ) ) ) ) ) ' ' ' ' ' ' g!! )-
0 0<0	0 <0 %	0 0 0 0f
I 
IZ 
I 
I 
I 
I 
I j $    6 16 <%(	   4 RV    .8 8
 8D 8 8 8 8v	 	z 	4 	 	 	 	 : t     j $    (% % %P1 1%cSV@V:W 1 1 1 18el    >1L1"1/21>A1
\1 1 1 1lF F FRL*-6:
\   &  EL      +<<+<	+< <+< %	+<
 5<u|Xel5KKL+< +< +< +<d )-
A
 A
<A
	A
 <A
 %	A
 A
 A
 A
 A
 A
r#   