
    çg0>                        d dl Z ddlmZmZmZ ddlmZ  edd i          edej        dej        d	ej        d
ej        dej        f
d                        ZddZ	d Z
edej        dej        d	ej        dej        d
ej        f
d            ZddZd ZddZ G d de j        j                  Z G d d          ZdS )    N   )cdiv
heuristicsjit)languageEVEN_Kc                 ,    | d         | d         z  dk    S )NKTILE_Kr    )nargss    Y/var/www/html/ai-engine/env/lib/python3.11/site-packages/triton/ops/blocksparse/matmul.py<lambda>r      s    E#Jx8A=     TILE_MTILE_Nr   BLOCKc                 b   t          j        d          |z   }||dz  z  }t          j        d          }t          j        |dz             }t          j        |dz             }||z  t          j        d|          |z  z   }t          j        d|          }| ||z  z   ||z  z   |d d d f         |z  z   |d d d f         |z  z   }t          j        |dz             }||z  t          j        d|          |z  z   }t          j        d|          } |||z  z   ||z  z   |d d d f         |
z  z   | d d d f         |	z  z   }!t          j        ||ft           j                  }"t          |d|           D ]}#|r)t          j        |          }$t          j        |!          }%nJt          j        ||d d d f         |#k     d          }$t          j        |!| d d d f         |#k     d          }%|"t          j        |$|%t           j                  z  }"|||z  z  }|!||	z  z  }!|"                    |j	        j
                  }&t          j        d|          |z  }'t          j        d|          |z  }(|||z  z   ||z  z   |'d d d f         |z  z   |(d d d f         |z  z   })t          j        |)|&d	
           d S )Nr   r         dtypeg        )maskother	out_dtypeTr   )tl
program_idloadarangezerosfloat32rangedottor   
element_tystore)*ABC	stride_za	stride_ha	stride_ma	stride_ak	stride_zb	stride_hb	stride_bk	stride_nb	stride_zc	stride_hc	stride_mc	stride_ncr
   grid_offsetlutr   r   r   r   r   block_idoff_zoff_hstart_amoffs_amoffs_aka_ptrsstart_bnoffs_bnoffs_bkb_ptrsacckabcoffs_cmoffs_cnpcs*                                             r   _sdd_kernelrM      s(    }Q+-H8a<CM!EGC!GE wsQwH")Av"6"6">?Gi6""G
)

)
 !!!T'
Y
&' $'
Y
&	'F wsQwH")Av"6"6">?Gi6""G
)

)
 $'
Y
&' !!!T'
Y
&	'F (FF#2:
6
6
6C1a&!! 	% 	% 	EAAAWT111W%5%9DDDAWQQQW%5%9DDDArvabj1111&9$$&9$$qw!""A i6""U*Gi6""U*G	

)


Y

 !!!T'
Y
&
' $'
Y
&	
'B
 HRr   c
                     |                      d          dk    r-|                      d          dk    r|                                 } |                     d          dk    r-|                     d          dk    r|                                }|r
|| }} | | }}|rdnd}
|rdnd}| j        |
         |j        |         }}||k    rt          d| d| d          |	<t	          j        | j        d	         |j        d	         ||f| j        | j        
          }n)|	j        | j        d	         |j        d	         ||fk    sJ |	}|j        d         d|j        d	         g}t          |         | |||                      d	          |                      d          |                      |rdnd          |                      |rdnd          |                     d	          |                     d          |                     |rdnd          |                     |rdnd          |                     d	          |                     d          |                     d          |                     d          |d	|||d|dd           |S )Nr   r   r   zInner dimension mismatch (A: z vs B: )r   r   device       )r   r   r   r   
num_stages	num_warps)	stride
contiguousshape
ValueErrortorchemptyr   rS   rM   )rG   rH   trans_atrans_btrans_cspdimsblockr9   widthsouta_dimb_dimKaKbrI   grids                   r   
sdd_matmulrj   S   se   xx{{aAHHQKK1,,LLNNxx{{aAHHQKK1,,LLNN 4!1&;G!BBrE!BBrEWU^QWU^B	RxxIIIBIIIJJJ
{KSYq\5%@XYX`aaayQWQZ1ueDDDDDGAJ171:&D	1a	QXXa[[!((+>11Q"?"?wJ]!!\]A^A^	QXXa[[!((+>11Q"?"?wJ]!!\]A^A^	QXXa[[!((1++qxx{{
AsU2Uq    Hr   c                     |                      d                              |                                          }|                                }|d fS )NFas_tuple)nonzeror&   intrY   )layoutrb   rS   r9   s       r   sdd_lutrq   u   sH    
..%.
(
(
+
+F
3
3
7
7
9
9C
..

C9r   GROUP_SIZE_Mc                     t          j        d          }t          j        d          }t          j        d          }t          j        d          }t          j        |||||          \  }}t          j        d          }||dz  z   }t          j        |dz             }t          j        |dz             }t          j        |dz             }t          j        |dz             } ||z   }!t          j        |!dz             }"t          j        |"d          }"t          j        d|          }#t          j        d|          }$| ||z  z   |"|z  z   |#d d d f         |z  z   |$d d d f         |z  z   }%||z  t          j        d|          z   }&t          j        t          j        |&|z  |          |          }&t          j        |!          }'t          j        |'d          }'|'t          j        d|          z   }(|||z  z   | |z  z   |&d d d f         |
z  z   |(d d d f         |	z  z   })t          j        ||ft           j	                  }*|!dz  }!t          j        |!dz             }+t          j        |+d          }+t          j        |!          },t          j        |,d          },t          |d|           D ]}-t          j        |%          }.t          j        |)          }/|*t          j        |.|/t           j	                  z  }*|%|+z  }%|)|,|	z  z  })|!dz  }!t          j        |!dz             }+t          j        |+d          }+t          j        |!          },t          j        |,d          },|*                    |j        j                  }0||z  t          j        d|          z   }1||z  t          j        d|          z   }2|| |z  z   ||z  z   |1d d d f         |z  z   |2d d d f         |z  z   }3t          j        |3|0|2d d d f         |k     	           d S )
Nr   r   r   rU   r      r   r   r   )r   r   num_programs	swizzle2dr    multiple_ofr!   max_contiguousr"   r#   r$   r%   r&   r   r'   r(   )4r)   r*   r+   	stride_azr-   	stride_amr/   r0   r1   r2   	stride_bnr4   r5   	stride_cm	stride_cnDS0DS1r9   r   r   r   rr   r   pid_mpid_n	num_pid_m	num_pid_npidzheaderoffsetr
   columnr<   pincr:   r>   r?   parB   start_bkrC   pbrE   inc_ainc_brF   rG   rH   rI   rJ   rK   rL   s4                                                       r   _dsd_kernelr      s    M!EM!E""I""I<uiLQQLE5=D519_FWVaZ  F

AWVaZ  FGFQJE<Dwtax  H~h**Hi6""Gi6""G	
TI	
Y


!!!T'
Y
&
' $'
Y
&
'B
 fnryF333Gw}f E EvNNGwt}}H~h**H1f---G	
TI	
)


$'
Y
&
' !!!T'
Y
&
'B (FF#2:
6
6
6CAIDGD1HEN5!$$EGDMMEN5!$$E1a&!! 
) 
)GBKKGBKKrvabj1111
e
ei	q!!ua((ua((qw!""Avo	!V 4 44GfnryF333G	

)




 !!!T'
Y
&
' $'
Y
&	
'B
 HRqqq)C/000000r   c
                    |                      d          dk    r-|                      d          dk    r|                                 } |                     d          dk    r-|                     d          dk    r|                                }|||rdnd         z  }
|                    d          |                    d          }|                    |rdnd          | j        }}|}|rn|
}|r|
n}|	!t	          j        ||||f|| j                  }n|	j        ||||fk    sJ |	}d}fd}t          |         | |||                      d          |                      d          |                      |rdnd          |                      |rdnd          |                     d          |                     d          |                     |rdnd          |                     |rdnd          |                     d          |                     d          |                     |rdnd          |                     |rdnd          |
|f||t          |d          |d	d	d	d
 |S )Nr   r   r   r   rR      c                 6    t          | d                   gS )Nr   )r   )metaBS0BS3widths    r   r   zdsd_matmul.<locals>.<lambda>   s    c4>22E3? r   rT   rU   )r   r   r   r   rV   rW   rr   )
rX   rY   sizer   r\   r]   rS   rZ   r   min)rG   rH   r^   r_   r`   ra   rb   r9   r   rd   AS1BS1r   CS0CS1CS2CS3rI   r   ri   r   r   s           `           @@r   
dsd_matmulr      sy   xx{{aAHHQKK1,,LLNNxx{{aAHHQKK1,,LLNN
&g,1-
-C
&&))C
&&))C
&&g$1
%
%CGE
C
C
!##cC
!##cC
{Kc3,E!(KKKyS#sC00000F??????D	1a	QXXa[[!((+>11Q"?"?wJ]!!\]A^A^	QXXa[[!((+>11Q"?"?wJ]!!\]A^A^	QXXa[[!((+>11Q"?"?wJ]!!\]A^A^S# VCrNN%TU!    Hr   c                 	   t          j        | |rdnd          }t          j        |                              d          \  }}|                                }||z  }|r|                     d          }	n*|                     dd                              d          }	|	                    d          }
t          j        |          }t          j        |dd         d	          |dd<   t          j	        ||
dz
  t          j        |          z            }|	dddf         |z  }|
                                }|ddxx         |dd         z  cc<   ||z  }|                    dd                              d|          }||ddddf<   |dddfxx         |dz
  |z  z  cc<   |||dk                      |||dk             df<   |                    d          }|rt          j        |
| j        
          }nt          j        g t           j        | j                  }d}t#          |                     d                    D ]}| |ddddf         
                                                                }|                                }dt          j        || j        
          z   ||dk    <   t          j        |||j        |j        dk             z   dz
  f          }||z  }||z  |z  }|ddxx         |dd         |z  |z  z  cc<   |                    dd                              d|          }|r&||ddddf<   |dddfxx         |dz
  |z  z  cc<   n+||z  |ddddf<   |dddfxx         |dz
  |z  |z  z  cc<   |||dk                      |||dk             df<   |                    d          }|                    d          }|dz  |z  d|z  z   }||z  }t          j        ||||fd	                              d                                          }t          j        ||fd	                              d                                          }t          j        d|j        |j                  }t          j        ||f          }t          j        ||f          }|                    t           j                                      |          }||fS )a  
    Generates the look-up table for incrementing pointers in the DSD/DDS matmul.
    Example (BLOCK=32, STEP=16)
    [[1, 0, 0, 1, 0],
     [0, 1, 1, 0, 1],
     [1, 0, 1, 0, 0]]

    Then the offsets for A are
     [0 , 16, 32, 48] <- row 0
      \----/  \----/
      col=0   col=3
     [64, 80, 96, 112, 128, 144] <- row 1
      \----/   \----/  \------/
       col=1    col=2    col=3
     [160, 176, 192, 208]
    which leads to increments table
    [0, 16, 16, 16, || 64, 16, 16, 16, 16, 16, || 160, 16, 16, 16]

    Because B is dense, the offsets are
    [0, 16, 96, 112] <- row 0
    [32, 48, 64, 80]  <- row 1
    [0, 16, 64, 80]   <- row 2
    r   r   Trl   Fr   NrP   )dim)rS   rR   rU      )rS   r   )r\   sum	ones_likern   flatten	transposer   
zeros_likecumsumr   cloneviewrepeatr!   rS   tensorint64r$   longcatTstackrY   r"   r   typeint32r&   )rp   rb   steptransrS   sizeshead_idcol_idsegmentsnnz
num_blocksoffsetsB_idxB_incsdivA_idxcurrent_offsetzlayoutwmsumA_incsr   r   incspadr9   s                             r   dsd_lutr      s   0 If5/aaa00Eoe,,44d4CCOGVMMOOEt|H =nnen,,q!$$,,e,<<!Ju%%G,uSbSzq111GABBKi*q.EOG4L4L!LMMG 1IE[[]]F
122JJJ%*JJJ
4-C[[Q&&q#..FF111abb5M
111a4LLLS1W$$LLL',WX\-B'CF78a< !#$[[__F
  
#Z>>>Ru{6=IIIv{{1~~&& 	# 	#AQ111Wo++--2244G;;==D#$u|D'O'O'O#OGGaK IunwyQ7O&ORS&STUUEd"NNU]U"F
122JJJ%*u$u,,JJJ[[Q&&q#..F 1qqq!""uqqq!tqD((uqqq!""uqqq!tqD(500',WX\-B'CF78a< !#$[[__FKKNNEkC!e)+G#~H['8VW=1EEEJJ2NNYY[[F;'Q///44R88CCEED +bDJ
?
?
?C9dC[!!D
)VTN
#
#C
((5;


"
"6
*
*C:r   c
                 :    t          || | | | |||||	
  
        S N)rd   )r   )
rG   rH   r^   r_   r`   ra   rb   r9   r   rd   s
             r   
dds_matmulr   Z  s.    aKW'k65RUW\beffffr   c                   F    e Zd ZeeedZed             Zed             Z	dS )_matmulsdddsdddsc                    t          j        |         ||||||||	|
|
  
        }|                     ||           || _        || _        || _        || _        || _        || _        || _	        || _
        || _        || _        |d u| _        |S r   )r   fnsave_for_backwardda_lutda_widthdb_lutdb_widthmodera   rb   r^   r_   r`   has_out)ctxrG   rH   r^   r_   r`   r   ra   rb   c_lutc_widthr   r   r   r   rd   rI   s                    r   forwardz_matmul.forwardg  s     JtQ7GWfeUT[adeeea###


	or   c                    | j         \  }}d\  }}| j        }| j        d         r`|d         |d         z   |d         z   }t          j        |         ||| j        | j         | j        | j        | j	        | j
        | j        	  	        }| j        d         r`|d         |d         z   |d         z   }t          j        |         ||| j         | j        | j        | j        | j	        | j        | j        	  	        }| j        r|nd }	||d d d d d d d d d d d d |	fS )N)NNr   r   r   )saved_tensorsr   needs_input_gradr   r   r`   r_   r^   ra   rb   r   r   r   r   r   )
r   dcrG   rH   dadbr   mode_damode_dbdouts
             r   backwardz_matmul.backwardz  s-     1Bx" 	?1gQ'$q'1GG$RCKS[#+WZWacfcl%(Z? ?B " 	?1gQ'$q'1GG$QOS[#+WZWacfcl%(Z? ?B[*rrd2tT4$d$dD$/ 	/r   N)
__name__
__module____qualname__rj   r   r   r   staticmethodr   r   r   r   r   r   r   c  sW        Jz	B	BB  \$ / / \/ / /r   r   c                       e Zd ZddZddZdS )matmulFc                 ^   |dvrt          d          || _        || _        || _        || _        || _        || _        |j        | _        t          |d          }| j        dk    r^t          |||          \  | _        | _        t          |||d|          \  | _        | _        t          |||d|          \  | _        | _        | j        dk    rit          |||| j         |          \  | _        | _        t          |||          \  | _        | _        t          |||| j        |          \  | _        | _        | j        dk    rkt          |||| j        |          \  | _        | _        t          |||| j         |          \  | _        | _        t          |||          \  | _        | _        d S d S )	Nr   z"Supported modes are: sdd, dsd, ddsrT   r   TFr   r   )NotImplementedErrorrb   r   r^   r_   r`   rp   rZ   ra   r   rq   r   r   r   r   r   r   r   )	selfrp   rb   r   rS   r^   r_   r`   r   s	            r   __init__zmatmul.__init__  s   ,,,%&JKKK
	l5"~~9'.vuf'E'E$DJ)0dF)S)S&DK)0eV)T)T&DK9'.vudDTV\']']$DJ)0)G)G&DK)0dlTZ)[)[&DK9'.vudDLRX'Y'Y$DJ)0$,FVX^)_)_&DK)0)G)G&DK r   Nc                     t                               ||| j        | j        | j        | j        | j        | j        | j        | j	        | j
        | j        | j        | j        |          }|S N)r   applyr^   r_   r`   r   ra   rb   r   r   r   r   r   r   )r   rG   rH   rd   rI   s        r   __call__zmatmul.__call__  sZ    MM!QdlDL$)UYU`bfbl*dl+t}+t}	 
 r   )FFFr   )r   r   r   r   r   r   r   r   r   r     s@        H H H H0     r   r   r   )r\    r   r   r   r   r   	constexprrM   rj   rq   r   r   r   r   autogradFunctionr   r   r   r   r   <module>r      s    % % % % % % % % % %       ==   <
 <
 /1l<
 EGL< |< .0\< < <  <~   D   G1
 G1
 /1lG1
 EGLG1 !lG1 46<G1 G1 G1 G1T# # # #L^ ^ ^Ng g g g*/ */ */ */ */en% */ */ */Z                   r   