
    çg3                         d dl Z ddlmZ ddlmZ ddlmZ d Zedej        dej        d	ej        fd
            Zedej        dej        d	ej        fd            Z	 G d de j
        j                  Z G d d          ZdS )    N   )jit)language)next_power_of_2c                 F    | dk    rdS | dk    rdS | dk    rdS | dk    rdS d	S )
N            i      i          )ns    Z/var/www/html/ai-engine/env/lib/python3.11/site-packages/triton/ops/blocksparse/softmax.py	num_warpsr      s?    CxxqCxxqCxxqDyyq2    ROW_SIZE
BLOCK_SIZEIS_DENSEc                    t          j        d          }t          j        d          }t          j        d          }|t          j        d          z  |z   }t          j        d|
          |z  }t          j        d|
          |z  }|||z  dz  z   }t          j        |dz             }t          j        |dz             }||z  }|||z   |z  |z  z  }|||z  |z  z  }|rt          j        d|
          }n[|dt          j        d          z  t          j        d          z  |z  z   }t          j        ||z   |z   ||k     d          }||z  |z   }||k     }t          j        ||z   |z   |t          d                     }|                    t           j                  }|}||z  }|M|||z  z  }|||z  z  }||z
  dz
  |z   }|dk    ||k     z  }t          j        |||z  z   |z   |d          }||z  }|                    t           j                  }t          j        ||k    |	z  t          d           |          }t          j	        |          }t          j
        | |z   |z   ||           d S )Nr   r	   r   maskotherinf        r   )tl
program_idnum_programsarangeloadfloattofloat32wheresoftmaxstore) OutA	stride_xzLUTRextent	stride_zr	stride_hrscale	is_causalr   r   r   hmzhmlane_nblock_nheadersizeoffsetoff_ansoff_lutstart_nr   aoutoff_lomask_lo
rel_logitss                                    r   _blocksparse_softmax_fwdrE      s    	aA
aA
aA	
R_Q	!	#BYq(##j0Fi8$$
2GB*$))F76A:DWVaZ  F	ME	fw*,z99E	a*n
**E +Yq(##1rq111BOA4F4FF*TT'#-'1$aPPPz!F*T>D
E	F"eEll]CCCA	RZA
C5LC}	Q]	Q]1*q.B&Q;6F?3WQV^f47#NNN
z
&&

C
(BFi'%,,
<
<C
*S//CHS5[6!3T222222r   c                    t          j        d          }t          j        d          }t          j        d          }|t          j        d          z  |z   }t          j        d|          |z  }t          j        d|          |z  }|||z  dz  z   }t          j        |dz             }t          j        |dz             }||z   |z  |z  }|||z  |z  z  }||k     }|||z  z   |z   }|||z  z   |z   }|rt          j        d|          }nW|dt          j        d          z  t          j        d          z  |z  z   }t          j        ||z   |z   |d          } | |z  |z   }t          j        ||z   |d          }!|!                    t           j                  }!t          j        ||z   |d          }"|"                    t           j                  }"t          j        ||k    |z  |!|!k    z  d|!          }!|!|"t          j        |!|"z  d          z
  z  }#|K|||
z  z  }|||z  z  }|	|z
  dz
  |z   }$|$dk    |$|	k     z  |z  }%t          j	        |||	z  z   |$z   |#|%           |#|z  }#| ||z  z   |z   }&t          j	        |&|z   |#|           d S )Nr   r	   r   r   r   r   )
r   r   r    r!   r"   r$   r%   r&   sumr(   )'DA
stride_zdxDOutstride_zdoutr)   stride_zoutr1   r,   DRr.   r/   r0   	stride_err2   r   r   r   r3   r4   r5   r6   r7   r8   r9   r:   r;   off_mnr   AsDOutsr=   r>   r?   r@   doutdarB   rC   DAss'                                          r   _blocksparse_softmax_bwdrU   J   s    	aA
aA
aA	
R_Q	!	#BYq(##j0Fi8$$
2GB*$))F76A:DWVaZ  Fw*,z9F
q:~++FT>D	q;		'B1|##f,E +Yq(##1rq111BOA4F4FF*TT'#-'1AFFFz!F*
V$c222A	RZA756>C888D772:D
"q&I%a0"a88A	
dRVAHa(((	)B	~
a)m
a)m1*q.B&Q;6F?3d:
a&j6)2G<<<<	eB q:~

&CHS6\2D))))))r   c                   P    e Zd Zed             Zed             Zed             ZdS )_softmaxc           	         t          j        g t           j        | j                  }|                                }t          | j        d                   D ]8}t          j        || |d d d d f                             d          f          }9||z  }t          j	        |          }t          j
        |d d         d          |dd <   |                     d          d d df         }t          j        ||fd                              d          }	t          j        |	|f                              t           j                                      |          }
|
t#          |                                          fS )	Ndtypedevicer   )dimr	   F)as_tupler   )torchtensorint64r[   clonerangeshapecatrG   
zeros_likecumsumnonzerostackviewtypeint32r$   intmax)layoutblockr[   _emptysizesr3   total_sizesoffsetscolumnsr9   luts              r   make_lutz_softmax.make_lut   sQ   bFMJJJv|A'' 	@ 	@AIufQ111Wo&9&9"&=&=>??EEem"5))l5":1555..%.00A6eW-1555::2>>i)**//<<??GGC))****r   c
                    |@t          |t          j                  r&|j        j        dk    sJ |                                }|j        d         }
|d         |d         |z  |
g}|dn|j        }|dn|                                }t          j        |          }t          |         |||                    d          |||d         |d         |d         |||t          |          |	t          |                     |                     ||           || _        || _        || _        || _        || _        || _        |j        | _        |	| _        || _        |S )Ncpur   r	   )r	   r	   r	   r	   r\   r   r   r   r   )
isinstancer_   Tensorr[   rk   itemrd   stride
empty_likerE   r   r   save_for_backwardspdimsrp   maxlutr1   	rel_shaperel_stridesrZ   	rel_dtypeis_denser2   )ctxr@   r1   rD   r2   r   rp   rv   r   r   Mgridr   r   rA   s                  r   forwardz_softmax.forward   s_   E5<!@!@<$----JJLLEGAJq	6!9u,a0$.$6LLJ<L	&0&8llj>O>O>Q>Qq!! &AHHQKK	"{1~{1~$V,,''		
 		
 		
 		
 	c3'''
	
	!%!
r   c                    | j         \  }}d }| j        d         r&t          j        | j        | j        |j                  }|j        d         }| j        d         | j        d         | j	        z  |f}t          j
        |          }t          |         ||                    d          ||                    d          ||                    d          | j        ||| j        d         | j        d         | j        d         | j        d         | j        | j	        t!          | j                  | j        t'          | j                             |d d |d d d d d d d d d d d d d d fS )Nr   rY   r   r	   r\   r   rz   )saved_tensorsneeds_input_gradr_   zerosr   r   r[   rd   r   rp   r   rU   r~   r1   r   r2   r   r   r   r   )r   rR   rA   rv   drr   r   rS   s           r   backwardz_softmax.backward   sQ    $S" 	TS]#-
SSSBIaL
1sz!}sy8!<d## &		!$++a..AIb!3?1#5sq7I3?[\K]My$SZ00\
++	
 	
 	
 	
 D$D$dD$dTXZ^`dfjlprvwwr   N)__name__
__module____qualname__staticmethodrw   r   r   r   r   r   rW   rW      si        + + \+      \ D x x \x x xr   rW   c                   &    e Zd ZddZdddddZdS )r'   Fc                     |j         | _        || _        || _        t                              | j        | j        |          \  | _        | _        || _        d S )N)	rd   r   ro   rp   rW   rw   rv   r   r   )selfro   rp   r[   r   s        r   __init__zsoftmax.__init__   sH    l
 ( 1 1$+tz6 R R$+ r   g      ?N)r1   rD   r2   c                    |'|j         |j         k    rt          d|j                    t                              ||||| j        | j        | j        | j        | j        	  	        }|S )Nz$relative position embedding must be )	rZ   
ValueErrorrW   applyr   rp   rv   r   r   )r   r@   r1   rD   r2   s        r   __call__zsoftmax.__call__   si    !j&6!'&A&AMAGMMNNNNN1eZDKUYU]_c_j=* *r   )F)r   r   r   r   r   r   r   r   r'   r'      sJ        ! ! ! ! $'45       r   r'   )r_    r   r   r   r   r   	constexprrE   rU   autogradFunctionrW   r'   r   r   r   <module>r      sO                     	 	 	 23 (*|23 *,	23
 (*|23 23 23 23j 6* (*|6* *,6* (*|6* 6* 6* 6*rOx Ox Ox Ox Oxu~& Ox Ox Oxd         r   