
    Χg?                       d dl Z d dlZd dlZd dlmZ d dlmZmZ d dlZd dl	m
Z
 d dlmZ ddlmZ  e ej        dd	                    Zd
 Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd ZdmdZd Zd Z d Z!d Z"dddZ#	 	 	 	 	 	 dndZ$	 	 	 	 	 	 	 dodZ% G d d           Z& ee!          d"             Z'	 dpd$Z(dqd%Z)dddd&ddd'd(ej*        d)ej*        d*ej*        d+eej*                 d,e+d-eeee         ee         ee         f                  d.ee,         fd/Z-dddd&ddd'd(ej*        d)ej*        d*ej*        d+eej*                 d,e+d-eeee         ee         ee         f                  d.ee,         fd0Z. e            rd dl/Z/d dl0m1Z2 e/j3        d1e2j4        d2e2j4        d3e2j4        d4e2j4        d5e2j4        d6e2j4        fd7            Z5e/j3        d2e2j4        d3e2j4        d5e2j4        d6e2j4        d8e2j4        f
d9            Z6d: Z7d;d;dd&dd<d(ej*        d=ej*        d>ej*        d+eej*                 d,e+d-eeee         ee         ee         f                  fd?Z8dd&ddd@d)ej*        d*ej*        d+eej*                 d,e+d-eeee         ee         ee         f                  d.ee,         fdAZ9e/j3        dBe2j4        dCe2j4        fdD            Z:dmdEZ;	 	 	 drdGej*        dHej*        dIej*        dJeej*                 dKe<dLe+dMee<         fdNZ=e/j3        dOe2j4        dPe2j4        dQe2j4        dRe2j4        dSe2j4        dTe2j4        d6e2j4        fdU            Z>dVej*        dWej*        dXej*        dYej*        dZej*        f
d[Z?e/j3        d\e2j4        d]e2j4        dRe2j4        d^e2j4        dSe2j4        dTe2j4        d_e2j4        d6e2j4        fd`            Z@	 dsdVej*        dWej*        dbej*        dcej*        ddej*        deej*        d.e,dZej*        dfe+fdgZAe/j3        dhe2j4        die2j4        dje2j4        d2e2j4        d3e2j4        dke2j4        d5e2j4        d6e2j4        d8e2j4        d^e2j4        fdl            ZBdS dZ;dZ9dZ8dZ=dZ?dZAdZBdS )t    N)	lru_cache)OptionalTuple)	warn_once)
has_triton   )get_meta*TORCH_SPARSE_BSR_SCATTER_MM_LRU_CACHE_SIZE   c                 (    | st          |          d S N)
ValueError)condmsgs     T/var/www/html/ai-engine/env/lib/python3.11/site-packages/torch/sparse/_triton_ops.pycheckr      s      oo     c                 R    t          |j        t          j        k    |  d           d S )Nz@(): only BSR sparse format is supported for the sparse argument.)r   layouttorch
sparse_bsr)f_namets     r   check_bsr_layoutr      s7    		E$$SSS    r   c                 ^    t          |j        |k    o|j        j        dk    |  d           d S )Ncudaz9(): all inputs are expected to be on the same GPU device.)r   devicetype)r   r   r   s      r   check_devicer   !   sB    		F6qx}6LLL    r   c           	      ^   t          |                                dk    o|                                dk    |  d|                                 d|                                 d           |j        dd          \  }}|j        dd          \  }}t          ||k    |  d| d| d           d S )Nr   zc(): all inputs involved in the matrix product are expected to be at least 2D, but got lhs.dim() == z and rhs.dim() == .zw(): arguments' sizes involved in the matrix product are not compatible for matrix multiplication, got lhs.shape[-1] == z( which is not equal to rhs.shape[-2] == )r   dimshape)r   lhsrhsmklkrns          r   check_mm_compatible_shapesr+   (   s    			Q)37799> 	J 	J #			J 	J=@WWYY	J 	J 	J   IbccNEArIbccNEB	
b 	R 	R "	R 	RLN	R 	R 	R    r   c           	          t          |j        |k    o3|j        t          j        t          j        t          j        ft          | z   v |  d| d|j         d           d S )Nz\(): all inputs are expected to be of the same dtype and one of (half, bfloat16, float32) or z, but got dtype == r!   )r   dtyper   halfbfloat16floattuple)r   r   r-   additional_dtypess       r   check_dtyper3   9   s    		5 	SGZ5?P8QQS 	' 	'3D	' 	'G	' 	' 	'	    r   c           	          t          |          dk    sJ d fd}t           ||          |  d|d          d|d          d           d S )	Nr   c                     | | dz
  z   S Nr    )vs    r   is_power_of_twoz(check_blocksize.<locals>.is_power_of_twoG   s    QK  r   c                 <    d}| D ]}|dk    o |          o|}|S )NT   r7   )bres	blocksizer9   s      r   is_compatible_blocksizez0check_blocksize.<locals>.is_compatible_blocksizeJ   s=     	K 	KI?Ay'A'AJsCC
r   z(): sparse inputs' blocksize (r   z, r   z;) should be at least 16 and a power of 2 in each dimension.)lenr   )r   r>   r?   r9   s      @r   check_blocksizerA   D   s    y>>Q! ! !     
	** 	D 	D1 	D 	D1 	D 	D 	D    r   c                 x    t          |                                           dk    r|                                 S | S )a  Return input as a triton-contiguous tensor.

    A triton-contiguous tensor is defined as a tensor that has strides
    with minimal value equal to 1.

    While triton kernels support triton-non-contiguous tensors (all
    strides being greater than 1 or having 0 strides) arguments, a
    considerable slow-down occurs because tensor data is copied
    element-wise rather than chunk-wise.
    r   )minstride
contiguous)r   s    r   make_triton_contiguousrF   X   s2     188::! ||~~r   c                 |    	 t          j        d |D              S # t          $ r t          d|  d           Y d S w xY w)Nc              3   4   K   | ]}|j         d d         V  d S Nr"   r$   .0r   s     r   	<genexpr>z'broadcast_batch_dims.<locals>.<genexpr>m   s,      'F'F'F'F'F'F'F'Fr   Fz3(): inputs' batch dimensions are not broadcastable!)r   broadcast_shapes	Exceptionr   )r   tensorss     r   broadcast_batch_dimsrQ   k   sf    U%'F'Fg'F'F'FGG U U UeSSSTTTTTTUs    ;;c              '   |   K   |D ]6}t          d           g|                                z  }||| <   ||         V  7d S r   )slicer#   )r#   slice_rangerP   r   slicess        r   slicerrV   r   sQ        ++(!si r   c              '      K   |D ]N}t          d           g|                                z  }t          | |          D ]\  }}||||<   ||         V  Od S r   )rS   r#   zip)dimsrU   rP   r   sdd_slices          r   multidim_slicerr]   y   st        4[[MAEEGG#dF++ 	 	JAw}!d



 r   c               '   P   K   | D ] }|V  |                                 E d {V  !d S r   )rD   )rP   r   s     r   ptr_stride_extractorr_      sK        88:: r   c              #   n   K   dt                     cxk    rdk    sn J dt                    cxk    rdk    sn J dd l} fd}fd} |j         |             D ]Q}d t           |          D             }d t          ||          D             }|d d d         g ||          R V  Rd S )Nr      c               3   ^   K   t                    D ]\  } }t          d| |          V  d S )Nr   )rX   range)fgmg	full_gridgrid_blockss     r   generate_grid_pointsz.grid_partitioner.<locals>.generate_grid_points   sI      )[11 	# 	#FB2r""""""	# 	#r   c              3      K                                    D ]%\  }}t          t          || |                    V  &d S r   )itemsnextr]   )rU   r   t_dimstensor_dims_maps      r   generate_sliced_tensorsz1grid_partitioner.<locals>.generate_sliced_tensors   sS      (..00 	; 	;IAvvvq99::::::	; 	;r   c                 <    g | ]\  }}}t          ||z
  |          S r7   )rC   )rL   rd   gpre   s       r   
<listcomp>z$grid_partitioner.<locals>.<listcomp>   s9     
 
 
!+RCR
 
 
r   c                 :    g | ]\  }}t          |||z             S r7   )rS   )rL   rp   gs      r   rq   z$grid_partitioner.<locals>.<listcomp>   s*    GGGA%BF##GGGr   )r@   	itertoolsproductrX   )	rf   rg   rm   ru   rh   rn   
grid_pointgridrU   s	   ```      r   grid_partitionerry      sO     I####!######K  %%%%A%%%%%%# # # # # #; ; ; ; ; (i')=)=)?)?@ ; ;

 
/29j+/V/V
 
 
 HGZ1F1FGGG 44R4j:226::::::::; ;r   c                     dd d d         }||}n,d t          fdt          ||          D                       }t          |||          D ]^}} | |g|R   d S )N)i  r{   rt   c                 F    | |S t          dt          | |                    S r6   )maxrC   )rs   re   s     r   valid_grid_dimz%launch_kernel.<locals>.valid_grid_dim   s&    y	 1c!Rjj)))r   c              3   6   K   | ]\  }} ||          V  d S r   r7   )rL   rs   re   r~   s      r   rM   z launch_kernel.<locals>.<genexpr>   sD       
 
&+aNN1b!!
 
 
 
 
 
r   )r1   rX   ry   )kernelrm   rf   rg   cuda_max_gridrx   sliced_tensorsr~   s          @r   launch_kernelr      s    .ttt4M#	* 	* 	*  
 
 
 
/2;/N/N
 
 
 
 
 "2;" " & &~ 	t%n%%%%%& &r   c                    |                                                      d          }|                                                     d          }t          |                                                     d                    }d |D             }t          j        |j        d d         gd |D             R  d  |d          } |d          } ||j        dd                    }fd|D             }|||g|R S )Nr   c                 R    g | ]$}t          |                    d                     %S r   )rF   	unsqueezerK   s     r   rq   z"prepare_inputs.<locals>.<listcomp>   s+    MMM!%akk!nn55MMMr   c              3   4   K   | ]}|j         d d         V  d S rI   rJ   rK   s     r   rM   z!prepare_inputs.<locals>.<genexpr>   s,      ;;aQWSbS\;;;;;;r   c                 z    |                      ||z                                 dt          |          dz
            S )Nr   r   )broadcast_toflattenr@   )r   
batch_dimsinvariant_dimss      r   batch_broadcast_and_squashz2prepare_inputs.<locals>.batch_broadcast_and_squash   s;    ~~j>9::BBs:"
 
 	
r   rt   c           	      D    g | ]} ||j         d d                   S )r"   NrJ   )rL   r   r   batch_dims_broadcasteds     r   rq   z"prepare_inputs.<locals>.<listcomp>   sC        	#"1&<agbcclKK  r   )crow_indicesr   col_indicesrF   valuesr   rN   r$   )bsrdense_tensorsr   r   r   rP   r   r   s         @@r   prepare_inputsr      s`   ##%%//22L//##--a00K#CJJLL$:$:1$=$=>>FMM}MMMG #3SbS;;7;;;  
 
 

 .-,e L -,[:PRWXXK''&RSS(9 F      G
 f6w666r   c                    t          | |g|R  }|                                                    |dz             }|                                                    |dz             }|                                                    ||                                j        dd          z             }||j        dd          z   }t          j        |||||j                  S )Nr   r   r"   sizer   )	rQ   r   r   r   r   r$   r   sparse_compressed_tensorr   )r   r   rP   batch_shaper   r   r   r   s           r   broadcast_batch_dims_bsrr      s    &vs=W===K##%%22;3FGGL//##00u1DEEKZZ\\&&{SZZ\\5G5L'LMMF233'D)k6SZ   r   c                     | j         ^ }}}|||d         z  |d         ||d         z  |d         gz   }|                     |                              dd          S )Nr   r   r   r"   )r$   view	transpose)r   r>   restr'   r*   	new_shapes         r   tile_to_blocksizer      sh    'KT1a	Yq\!	Yq\!	 I 66)&&r2...r   c                     | j         dk     r |                     d          } | j         dk      | j         dk    r|                     d| j         dz
            } | j         dk    sJ | j                    | S )zReturn tensor as 3D tensor by either prepending new dimensions to
    the tensor shape (when ``tensor.ndim < 3``), or by collapsing
    starting dimensions into the first dimension (when ``tensor.ndim >
    3``).
    ra   r   )ndimr   r   r$   )tensors    r   	as1Dbatchr      su     +//!!!$$ +//{Q6;?33;!V\Mr   accumulatorsc                2   |d         }| j         dk    sJ | j        \  }}}|dk    r|dd         \  }}	|j         dk    sJ |j        \  }
}}||k    sJ |5|j        d         dz
  }t          j        |||f| j        | j                  }n|j        \  }}}||k    sJ ||k    sJ |dz  s|dz  s|dz  st          qt          |j        d         dz
            D ]R}||         }||dz            }t          ||          D ],}|	|         \  }}||xx         | |         ||         z  z  cc<   -Snt          | |||	|           |S |dk    rE|j        }t          |          }|j        \  }}}||z  dk    sJ |dd         \  }}}}}|d	         }|`||	                                
                                dz   |z  z   } t          j        g |dd
         | |R | j        | j                  }n|j        d
d         \  } }!|!|k    sJ |j        }"t          |          }||z  }|dz  s|dz  s|dz  st          (|                                 t          |          D ]}#t          |j        d                   D ]}||         
                                }$||         
                                }||dz            
                                }t          |$|          \  }%}&||#|%|%|z   |&|&|z   f         }'t          ||          D ]X}||         ||         }}t          |
                                |          \  }(})|'| |         ||#|(|(|z   |)|)|z   f         z  z  }'Ynt          | |||||||           |                    |"          S |dk    rt|j        }t          |          }|j        \  }}}||z  dk    sJ |dd         \  }}}}|d	         }|`||	                                
                                dz   |z  z   } t          j        g |dd
         | |R | j        | j                  }n|j        d
d         \  } }!|!|k    sJ |j        }"t          |          }||z  }|dz  s|dz  s|dz  st          7t          |          D ]%}#t          t          |                    D ]}*t          ||*         
                                |          \  }%}&|%|z  }+|&|z  },||+         
                                }-||+dz            
                                }.||#|%|%|z   |&|&|z   f         }'t!          t          |-|.                    D ]b\  }/}||,|.z  ||,z
  |-z  z   |/z            
                                }t          ||          \  }(})|'| |         ||#|(|(|z   |)|)|z   f         z  z  }'c'n7t          j        d|j        |j                  }t          | |||||||           |                    |"          S t%          |          )ad  Scattered matrix multiplication of tensors.

    A scattered matrix multiplication is defined as a series of matrix
    multiplications applied to input tensors according to the input
    and output mappings specified by indices data.

    The following indices data formats are supported for defining a
    scattered matrix multiplication operation (:attr:`indices_data[0]`
    holds the name of the indices data format as specified below):

    - ``"scatter_mm"`` - matrix multiplications scattered in batches
      of tensors.

      If :attr:`blocks` is a :math:`(* 	imes M 	imes K) tensor,
      :attr:`others` is a :math:`(* 	imes K 	imes N)` tensor,
      :attr:`accumulators` is a :math:`(* 	imes M 	imes N)` tensor,
      and :attr:`indices = indices_data['indices']` is a :math:`(*
      	imes 3)` tensor, then the operation is equivalent to the
      following code::

        c_offsets, pq = indices_data[1:]
        for r in range(len(c_offsets) - 1):
            for g in range(c_offsets[r], c_offsets[r + 1]):
                p, q = pq[g]
                accumulators[r] += blocks[p] @ others[q]

    - ``"bsr_strided_mm"`` - matrix multiplications scattered in
      batches of tensors and a tensor.

      If :attr:`blocks` is a :math:`(Ms 	imes Ks) tensor,
      :attr:`others` is a :math:`(* 	imes K 	imes N)` tensor,
      :attr:`accumulators` is a :math:`(* 	imes M 	imes N)` tensor, then
      the operation is equivalent to the following code::

        c_indices, r_offsets, p_offsets, q_offsets, meta = indices_data[1:]
        for b in range(nbatches):
            for i, r in enumerate(r_offsets):
                r0, r1 = divmod(r, N)
                acc = accumulators[b, r0:r0 + Ms, r1:r1 + Ns]
                for g in range(c_indices[i], c_indices[i+1]):
                    p = p_offsets[g]
                    q0, q1 = divmod(q_offsets[g], N)
                    acc += blocks[p] @ others[b, q0:q0 + Ks, q1:q1 + Ns]

      where ``Ns = N // meta['SPLIT_N']``, and ``M`` and ``K`` are
      integer multiples of ``Ms`` and ``Ks``, respectively.

    - ``"bsr_strided_mm_compressed"`` - matrix multiplications
      scattered in batches of tensors and a tensor. A memory and
      processor efficient version of ``"bsr_strided_mm"`` format.  If
      :attr:`blocks` is a :math:`(Ms 	imes Ks) tensor, :attr:`others`
      is a :math:`(* 	imes K 	imes N)` tensor, :attr:`accumulators`
      is a :math:`(* 	imes M 	imes N)` tensor, then the operation is
      equivalent to the following code::

        c_indices, r_offsets, q_offsets, meta = indices_data[1:]
        for b in range(nbatches):
            for r in r_offsets:
                m = (r // N) // Ms
                n = (r % N) // Ns
                r0, r1 = divmod(r, N)
                c0, c1 = c_indices[m], c_indices[m + 1]
                acc = accumulators[b, r0:r0 + Ms, r1:r1 + Ns]
                for i, p in enumerate(range(c0, c1)):
                    q = q_offsets[n * c1 + (SPLIT_N - n) * c0 + i]
                    q0, q1 = divmod(q, N)
                    acc += blocks[p] @ others[b, q0:q0 + Ks, q1:q1 + Ns]

      where ``Ns = N // meta['SPLIT_N']``, and ``M`` and ``K`` are
      integer multiples of ``Ms`` and ``Ks``, respectively.

      Notice that the order of ``r_offsets`` items can be arbitrary;
      this property enables defining swizzle operators via
      rearrangements of ``r_offsets`` items..

    Auxilary functions are provided for pre-computing
    :attr:`indices_data`. For example,
    :func:`bsr_scatter_mm_indices_data` is used to define indices data
    for matrix multiplication of BSR and strided tensors.

    Parameters
    ----------
    blocks (Tensor): a 3-D tensor of first matrices to be multiplied

    others (Tensor): a tensor of second matrices to be multiplied. If
      ``indices_data[0]=="scatter_mm"``, the tensor is a 1-D batch
      tensor of second input matrices to be multiplied. Otherwise, the
      second input matrices are slices of the :attr:`others` tensor.
    indices_data (tuple): a format data that defines the inputs and
      outputs of scattered matrix multiplications.

    Keyword arguments
    -----------------

    accumulators (Tensor, optional): a tensor of matrix product
      accumulators. If ``indices_data[0]=="scatter_mm"``, the tensor
      is a 1-D batch tensor of output matrices. Otherwise, output
      matrices are slices of the :attr:`accumulators` tensor.
    r   ra   
scatter_mmr   Nr-   r   r;   bsr_strided_mmSPLIT_Nr"   bsr_strided_mm_compressedr   )r   r$   r   zerosr-   r   _scatter_mm2rc   r   r}   item_scatter_mm6zero_divmodr   r@   	enumerateemptyNotImplementedError)0blocksothersindices_datar   indices_formatPMsKs	c_offsetspqQKs_NsRMs_Ns_rg0g1rs   pqothers_shapeBKN	c_indices	r_offsets	p_offsets	q_offsetsmetar   MN_accumulators_shaper<   r_r0r1accq0q1jr'   r*   c0c1is0                                                   r   r   r     s   H "!_N;!IAr2%%$QRR(	2{a\
3Syyyy"Q&A ;B6<  LL ',KAsC"9999"99997 	Fb2g 	Fb 	FL,@9?1-122 = =q\q1u%r2 = =Aa5DAq OOOvay6!9'<<OOOO== BEEE	+	+	+|6"",1a2v{{{{;G;K8	9iDy/immoo**,,q0Q66A ;*,ss#*Q***&,v}  LL !&rss+EAr7777)/ ..'\7 	b2g 	b 	L,@   1XX 
Q 
Qyq122 	Q 	QA"1**,,B"1**,,B"1q5)..00B#B]]FB&q"rBw,R"W'DEC"2r]] Q Q(|Yq\1!'!!4!4Bvay6!R"r'\2R<2O+PPPQ	Q
Q 	 	 	   !3444	6	6	6|6"",1a2v{{{{0<QRR0@-	9iy/immoo**,,q0Q66A ;*,ss#*Q***&,v}  LL !&rss+EAr7777)/ ..'\7 	b2g 	b 	L,@1XX Q Qs9~~.. 
Q 
QA#IaL$5$5$7$7;;FBbAbA"1**,,B"1q5)..00B&q"rBw,R"W'DEC )%B-- 8 8 Q Q1%a"f!r/A&AA&EFKKMM!'1Bvay6!R"r'\2R<2O+PPPQ
QQ IOI4D  I 	 	 	   !3444 ".111r   c           
      .   ||||	|
|hd hk    rBt           j                                        }t          d| ||||f|dt           j        df          }| |j        d$i | |S | ||fdk    rX||fdk    rd}d}d}d	}d}
d	}	n||fd
k    rd}d}d}d	}d}
d	}	n||fdk    rd}d}d}d	}d}
d	}	n||fdk    rd}d}d}d}d}
d	}	n| ||fdk    rX||fdk    rd}d}d}d}d}
d}	nn||fd
k    rd}d}d}d	}d}
d}	nX||fdk    rd	}d}d}d	}d}
d	}	nB||fdk    rd}d}d}d	}d}
d	}	n,| ||fdk    rj||fdk    rd	}d}d}d}d}
d}	n||fd
k    rd}d}d}d}d}
d}	n||fdk    rd}d}d}d	}d}
d}	n||fdk    rd}d}d}d	}d}
d	}	n||fdk    rd}d}d}d}d}
d	}	n| ||fdk    ri||fdk    rd	}d}d}d}d}
d}	n||fd
k    rd	}d}d}d	}d}
d}	n||fdk    rd	}d}d}d	}d}
d	}	nq||fdk    rd}d}d}d	}d}
d	}	n\||fdk    rd	}d}d}d}d}
d	}	nG| ||fdk    r>||fdk    rd}d}d}d}d}
d}	n)||fd
k    rd}d}d}d}d}
d}	n||fdk    rd}d}d}d}d}
d	}	|.ddd	ddddddd	                    |d          }|dk    r|dk    rd}||z  }|t          |dk     rdnd|          }|t          |dk     rdnd|          }|
pd}
|	t          | |          dk    rdddd                    |d	          }	nvt          | |          dk    rdddd                    |d	          }	nGt          | |          dk    rdd	d                    |d	          }	nddd                    |d	          }	|pd	}||k    sJ t          ||                      ||k    sJ t          ||                      || k    sJ t          | |                       ||k    sJ t          ||!                      ||k    sJ t          ||"                      t          d$||||
|	|d#|S )%Nr   r         ?version)   r   r   )r;   r;   r   r;      )    r   r   r   )@   r   )   r   )   r   r      r   r   )   r   r   )r   r   )   r   r   )   r   r   r   )	r;   r   r   r   r   r   r   r   i    r   r   r   )r;   r   r   )r;   r   )TILE_Mr   )TILE_Nr   )r   r   )r   r   )r   r   )r   r   
GROUP_SIZE
num_stages	num_warpsr   r7   )	r   r   get_device_namer	   float16updategetrC   dict)r   r   r   r   r   r   r   r   r   r   r   extradevice_namer   r   s                  r   scatter_mm_metar     s    	J
CvMMj00221b"s+	
 
 
 DK  %   K q!9
""Bx8##

		bX%%

		bX%%

		bZ''

	AY*$$Bx8##

		bX%%

		bX%%

		bZ''

	AY+%%Bx8##

		bX%%

		bX%%

		bZ''

		bZ''

	AY+%%Bx8##

		bX%%

		bX%%

		bZ''

		bZ''

	AY+%%Bx8##

		bX%%

		bX%%

	 

 

 #a** 	 99dG	
gB~288RRR00~288RRR00qJq!99tA1--11"a88IIAYY$A1--11"a88IIAYY#A**2q11IIA**2q11IqJR<<<V333<<<R<<<V333<<<777D1$$$777777D1$$$777777D1$$$777     r   c                    |t           j        }|d}||	|
|hd hk    r#t           j                                        }| |||||dk    |dk    |dk    f}t	          d|||||f          }||dk    rt	          d||||df          }|t	          dg |d d         d|dd          R |||df          }t          |pi           D ]E}||         }|d         }|d	         }||z  }||z  dk    r||k    rt          |          }||z  |d	<   F| |j        di | |S t          d
| d|d|d|d|d|d|           |pt          ||z  d          }|pd}|
pd}
|	pd}	t          d|||
|	d|S )Nr   r   r   bsr_dense_addmmr   r   *ra   r   z@bsr_dense_addmm uses non-optimal triton kernel parameters for M=z K=z N=z Ms=z, Ks=z beta=z alpha=r   )r   GROUP_SIZE_ROWr   r   r7   )
r   r   r   r   r	   sortedr   r   r   r}   )r   r   r   r   r   betaalphar   r  r   r   sparsityr-   _versionr   r   keyr   matching_metamkeymeta_r*   split_ncs                           r   bsr_dense_addmm_metar    s^   ( }J7D6AAj0022!QB	419eqjAsK(E89T
 
 
 <HOO!3hs=S  D <$!)#bqb')3)QRR))!5#.	  M }233 - -%d+G	*Lq5A::!q&&;;D&'1fDODK  %   K
 }QR}}VW}}[\}}`b}}gi}}mq}}uz}}   (Q"WaG#(qNqJQI %	 
   r   c                   :    e Zd ZdZd Zd Zd Zed             ZdS )TensorAsKeyaS  A light-weight wrapper of a tensor that enables storing tensors as
    keys with efficient memory reference based comparision as an
    approximation to data equality based keys.

    Motivation: the hash value of a torch tensor is tensor instance
    based that does not use data equality and makes the usage of
    tensors as keys less useful. For instance, the result of
    ``len({a.crow_indices(), a.crow_indices()})`` is `2`, although,
    the tensor results from `crow_indices` method call are equal, in
    fact, these share the same data storage.
    On the other hand, for efficient caching of tensors we want to
    avoid calling torch.equal that compares tensors item-wise.

    TensorAsKey offers a compromise in that it guarantees key equality
    of tensors that references data in the same storage in the same
    manner and without accessing underlying data. However, this
    approach does not always guarantee correctness. For instance, for
    a complex tensor ``x``, we have ``TensorAsKey(x) ==
    TensorAsKey(x.conj())`` while ``torch.equal(x, x.conj())`` would
    return False.
    c                 \   d }t          j        |          | _        |j        t          j        u r ||          | _        n|j        t          j        t          j        hv r@ ||	                                           ||
                                          f| _        ns|j        t          j        t          j        hv r@ ||                                           ||                                          f| _        nt          |j                  t!          | j                  | _        d S )Nc                     | j         j        s| j         j        rJ | j                     |                                 |                                 | j        |                                 | j         fS r   )r-   is_floating_point
is_complexdata_ptrstorage_offsetr$   rD   )objs    r   get_tensor_keyz,TensorAsKey.__init__.<locals>.get_tensor_keyD  sb     	3Wsy7KWWciWWW""$$	

	 r   )weakrefref_obj_refr   r   stridedr  
sparse_csrr   r   r   
sparse_csc
sparse_bscccol_indicesrow_indicesr   hash_hash)selfr  r  s      r   __init__zTensorAsKey.__init__C  s   	 	 	&  C((:&&%~c**DHHZE,e.>???s//1122s0011DHH ZE,e.>???s//1122s0011DHH
 &cj111$(^^


r   c                     | j         S r   )r#  r$  s    r   __hash__zTensorAsKey.__hash__h  s
    zr   c                 t    t          |t                    sdS | j        |j        | |u S | j        |j        k    S )NF)
isinstancer  r  r  )r$  others     r   __eq__zTensorAsKey.__eq__k  sC    %-- 	58uy0 5= x59$$r   c                 *    |                                  S )z'Return object if alive, otherwise None.)r  r'  s    r   r  zTensorAsKey.objt  s     }}r   N)	__name__
__module____qualname____doc__r%  r(  r,  propertyr  r7   r   r   r  r  ,  sg         ,#$ #$ #$J  % % %   X  r   r  )maxsizec	           	      |	   |j         }	|	J |	                                |	                                }}
|
j        }t          j        }| dk    rd||z  }g }t	          j        |||          |z  }t          ||z            D ]}|
|                                         }|
|dz                                            }||k    r@|	                    |||         ||z  z  
                    |          |                    ||z
            z              t	          j        |          }|
                                }|                                }|||z  z  }||z                       d          }|
}||                             |          }|                    dd          \  }}||         }| |||fS | dk    r||z  }g }g }t	          j        |||          |z  }t          ||z            D ]}|
|                                         }|
|dz                                            }||k    r@|	                    t	          j        ||||          
                    |                     |	                    |||         ||z  z  
                    |          |                    ||z
            z              t	          j        |          }|
                                }|                                }|||z  z  }||z                       d          }t	          j        |
d d         t	          j        ||                             |          d          f          }t	          j        |          }| ||||fS | d	k    r'|}dg}g }t          |          D ]}t          ||z            D ]}|
|                                         }|
|dz                                            }t          ||z            D ]}|	                    |d         |z   |z
             t          ||z
            D ]J} || z   }!||!                                         |||z  z  z   ||z  z  |z   }"|	                    |!|"g           K͌| t	          j        |||          t	          j        |||          fS t'          d
| d          )Nr   r   r   rt   T)
descendingstabler   r   r   zInvalid indices_format=z>. Expected bsr_strided_mm_compressed|bsr_strided_mm|scatter_mm)r  r   r   r   r   int32arangerc   r   appendrepeatrepeat_interleavecatdiffnonzeror   sortcumsumr   r   )#r   r   r   r   r   r   nbatchesr   compressed_sparse_tensor_as_keyr   r   r   r   indices_dtyper   q_offsets_lstr<   r'   r   r   r   crow_indices_diffnon_zero_row_indicesar   r   nnz_per_rowindicesp_offsets_lstr   
pq_offsetsr*   r   r   r   s#                                      r   _bsr_scatter_mm_indices_datarL  z  sI    *
-C??? # 0 0 2 2COO4E4E+L FKM444'\LfEEEJqBw 	 	Aa%%''Ba!e$))++BRxx  RU#rAv.66w??%%b2g../    Im,,	(--//088:: BF+ULL$$	 	'(<=OOPWXX*//4/MMWg&		9i@@	+	+	+'\LfEEEJqBw 	 	Aa%%''Ba!e$))++BRxx  R=HHHOOPWXX     RU#rAv.66w??%%b2g../    Im,,	(--//088:: BF+ULL$$	IRaR %&:;MMgVV 
 
	 Im,,		9iKK	<	'	'C	
x 		2 		2A17^^ 2 2!!_))++!!a%(--//qBw 2 2A$$Yr]R%7"%<==="27^^ 2 2F(^0022Q!r']BqBwORSS"))1a&1111222 L-GGGL=HHH
 	
 f~fff
 
 	
r   r   c                    |                                  dk    sJ | j        dk    sJ |                                 }|                                 }|                                 j        dd         }| j        \  }}|\  }	}
|j        dd         \  }}||k    sJ |j        dd                                         }t          ||||	|
fi |}d|vr3|                    | j	        t          j        t          j        hv            |d         }t          |||||	|
||t          |           	  	        }|dk    r|                    d	
           ||fz   S |dk    r|                    d
           ||fz   S |S )zkComputes indices data for :func:`scatter_mm` used in BSR and
    strided tensor matrix multiplication.
    r   r   r"   N
allow_tf32rN  r   r   T)is_compressedr   F)	dense_dimr   r   r   r   r$   numelr   r   r-   r   r   r/   rL  r  )r   r+  r   
meta_inputr   r   r>   r   r   r   r   K_r   rA  r   r   r   s                    r   bsr_scatter_mm_indices_datarU    s    ==??a8q====##%%L//##K

"233'I9DAqFBKEB7777{3B3%%''H1aB99j99D:%%syU]EN,KKLLL9oG/1aR7K<L<L L 444$'''tg%%	+	+	+%(((tg%%r   c           
         | j         dk    sJ |j         dk    sJ | j        d         | j        d         |j        d         }}}|                                 j        dd         }|t          | |d          }|d         }|5t	          j        g |j        dd         ||R | j        | j                  }|j        }	t          |          }| 	                                dk    r|
                                 n|d	v r;|
                                 t          |                                 |||
           n|dk    r|j        dd                                         }
t	          j        |
|z  |d         z  |z  |d         z  |d         |d         f| j        | j                  }t          |                              dd                              |
||d         z  |d         ||d         z  |d                                       dd                              dd          }t          |                                 |||
           |                    |                    d|
||d         z  ||d         z  f                              dd                              |
||                              dd                     nt+          |          |                    |	          S )zBSR @ strided -> stridedr   r"   rt   Nr   )r   r   r   >   r   r   r   r   r   )ra   r   r   r   )r   r   ra   r   )r   r$   r   rU  r   r   r-   r   r   _nnzr   r   rR  r   r   r   movedimr   copy_	unflattenreshaper   )r   r+  r   outr   r   r   r>   r   	out_shaperA  r   r   s                r   bsr_scatter_mmr^    s    8q====:????2	"u{2BB

"233'I2'B
 
 
 "!_N
{k'ek#2#''B''sy
 
 
 	I
C..C
xxzzQ			J	J	J		3::<<3GGGGG	<	'	';ss#))++{21-2ilB!!
 ):
 
 
 eYr2Til"!il"!  Wl  WQ]] 	 	3::<<LQQQQ		""HbIaL0"	!2DE  Wl  WXr2&&Yr2		
 		
 		
 		
 ".11188Ir   Fr  r  r\  skip_checksmax_gridr   inputr   denser\  r`  ra  r   c                b   ||j         t          j        u rd}	|                                }
|
                                dz
  }|j        |         }|j        d         }t          |	||          }t          j        |||fz   t          j        |j	                  }t          | ||||||||	  	        S )N_int_bsr_dense_addmmr   rt   r   r_  )r-   r   int8r   r#   r$   rQ   r   r7  r   r   )rb  r   rc  r  r  r\  r`  ra  r   r   r   
batch_ndimr   r   original_batch_dims_broadcasteds                  r   re  re  E  s     {u{ej00'''))!%%''!+
Ij!KO*>vsE*R*R'k+q!f4+<
 
 

 
 
 
 
r   c                   ! d}	|                                 }
|                                }|                                }|                                dz
  }|j        ||dz            \  }}|
j        |dz   |dz            }|j        d         }|+t          |	||          }|                    |||fz             }|                                dk    sdk    s|dk    s|dk    s|dk    rMdk    r|                                 n0|	                    |            dk    r|
                               |S ft          d|                                |d         z  |d         z  ||z  z  z
  d          }t          ||||d         |d         ||j        	  	        |}t          || ||          \  }}}
} }}|\                      d|z            }||z   |}t!          | f          }t!          | f          }t!          |  f          } t"          j        t&          j        t"          j        t&          j        t"          j        t&          j        t"          j        t&          j        t"          j        t&          j        t"          j        t&          j        i|j                 !|                    d          }|                    d          dz
  }|                    d	          }|||f}|?t5          |d d         d d d                   d
dt7          |d d                   z
  z  z   }nd }|
d|d|d| d|d|di}dk    sJ  !fd}t9          ||||           |                                |                                k    r-|	                    |                    |j                             |S )Nr   r   r   ra   rt   r   )r  r-   r   r   r   r   NNr   Nrt   )r   r   )r   r   Nc                     t          |          g t          | R dk    dk    dk    t          j        k    d d S )Nr   r   )beta_is_onebeta_is_nonzeroalpha_is_oneBLOCKSIZE_ROWBLOCKSIZE_INNERBLOCKSIZE_COLrN  	acc_dtype)_bsr_strided_addmm_kernelr_   tlfloat32)	rx   r   BKBMBNr  r  dot_out_dtyper   s	     r   r   zbsr_dense_addmm.<locals>.kernel  s    !$' 	
!>2	
	
 	
 	
 	 AI!$
2#	
 	
 	
 	
 	
 	
 	
r   )r   r   r   r#   r$   rQ   	new_emptyrW  r   rY  mul_roundr  r-   r   r   r   r   r   rv  rw  r/   float64rf  r7  r   r1   r@   r   r  r   )"rb  r   rc  r  r  r\  r`  ra  r   r   r   r   r   rg  r   r   r>   r   rh  r  
out_backupr   out_untiled	n_batchesn_block_rowsn_block_colsrf   rg   rm   r   rx  ry  rz  r{  s"      ``   `                     @@@@r   r   r   j  s    FZZ\\F##%%L//##K!!##a'J9Z*q.01DAqZ!^j1n<=IBA {*>vsE*R*R'oo=AFGG
xxzzQ%1**Q!q&&AFF199IIKKKKIIeqyy
|SXXZZ)A,61EQOOQRSS#aLaL)

 

 

 J;IUE3< <8L+vueS FBhhy!r'**G	
gBK
C"b
*
*Ceb"X..Eeb"X..E 	rz
rzrz
BHRX 
iM 

1I$$R((1,L::b>>LL,7IHRaRL2.//'QXbqb\ARAR=R2SS 	m_{}[O A::::
 
 
 
 
 
 
 
 
 
 
  &/9kBBB
||~~,,.... 	))**:;;<<<r   IS_BETA_ZEROrq  rs  TILE_Krt  rN  c            
         t          j        d          } t          j        d          }!||| z  z   ||!z  z   }"t          j        |"          }#t          j        |"|z             }$|$|#z
  }%|%dk    rd S t          j        d|          }&t          j        d|          }'||| z  z   |	|#z  z   |
|&d d d f         z  z   ||'d d d f         z  z   }(||| z  z   ||#z  z   })||| z  z   ||!z  z   ||&d d d f         z  z   }*||| z  z   ||'d d d f         z  z   }+t          j        d|          },t	          |%          D ]2}-t          j        ||f|          }.t          j        |)          }/t	          d||          D ]}0|0|,z   }1|1|k     }2t          j        |*||1d d d f         z  z   |2d d d f         d          }3t          j        |+||/z  z   ||1d d d f         z  z   |2d d d f         d          }4|.t          j        |3|4||          z  }.|r|.| z  }.n| |.z  |t          j        |(          z  z   }.t          j        |(|.                    |j	        j
                             |(|	z  }(|)|z  })4d S )Nr   axisr   r-           maskr+  rN  	out_dtype)rv  
program_idloadr8  rc   r   dotstoretor-   
element_ty)5r  r  r  rq  rs  kr  
values_ptrvalues_batch_stridevalues_nnz_stridevalues_row_block_stridevalues_col_block_stridecrow_indices_ptrcrow_indices_batch_stridecrow_indices_stridecol_indices_ptrcol_indices_batch_stridecol_indices_stridemat1_ptrmat1_batch_stridemat1_tiled_row_stridemat1_tiled_col_stridemat1_row_block_stridemat1_col_block_stridemat2_ptrmat2_batch_stridemat2_tiled_row_stridemat2_tiled_col_stridemat2_row_block_stridemat2_col_block_stridert  rN  	batch_pidrow_block_pidcrow_indices_offset_ptr
nnz_offsetnnz_offset_nextrow_nnzrow_block_arangecol_block_arangevalues_block_ptrscol_index_nnz_ptrmat1_block_ptrsmat2_block_ptrsk_tile_arange_	acc_block	col_blockk_tile	k_offsetsmask_k
mat1_block
mat2_blocks5                                                        r   _sampled_addmm_kernelr    sv   F Mq)))	1--- ')34!M12 	 
 W455
'"9<O"OPP "J.a<<F9Q669Q66 !I-.*,- &(8D(AAB &(8qqq(AA	B 	 &23 :-. 	 )+,#m34 $&6qqq$w&??@ 	 )+,#&6tQQQw&??@ 	 	!V,,w &	4 &	4A-!?yQQQI  122I1f--  "]2	"QW#&;iaaa>P&PPaaa  
  W#+i78+i4.@@A  4  
 RV
zY   		  RU"		!I-rw?P7Q7Q0QQ	 H&	Z5E5P(Q(QRRR !22!33M&	4 &	4r   r  c                    t          j        d          }t          j        d          }t          j        d          }t          j        d          }t          j        d          } t          j        |||| |          \  }}|||z  z   ||z  z   }!t          j        |!          }"t          j        |!|z             }#|#|"z
  }$|$dk    rd S t          j        d|          }%t          j        d|          }&| ||z  z   ||"z  z   ||%d d d f         z  z   ||&d d d f         z  z   }'|||z  z   ||z  z   ||&d d d f         z  z   ||%d d d f         z  z   }(|||z  z   ||z  z   ||z  z   ||%d d d f         z  z   ||%d d d f         z  z   })||	|z  z   |
|"z  z   }*t          j        ||f|          }+t          |$          D ]i},t          j        |'          }-t          j        |*          }.t          j        |(||.z  z             }/|+t          j        |-|/||          z  }+|'|z  }'|*|
z  }*jt          j	        |)|+
                    |j        j                             d S Nr   r  r   r   r  r  )rv  r  num_programs	swizzle2dr  r8  r   rc   r  r  r  r-   r  )0r  r  r  r  r  r  r  r  r  r  r  	dense_ptrdense_batch_stridedense_tiled_row_stridedense_tiled_col_stridedense_row_block_stridedense_col_block_stride
output_ptroutput_batch_strideoutput_tiled_row_strideoutput_tiled_col_strideoutput_row_block_strideoutput_col_block_striderq  rs  rt  rN  r  r  r  col_block_pidr  r  r  r  r  r  r  r  r  dense_block_ptrsoutput_ptrsr  output_acc_blockr  values_blockdense_row_idxdense_blocks0                                                   r   "_bsr_strided_dense_rowspace_kernelr  d  sD   \ Mq)))	1---1---A...A...')|=,n(
 (
$}
 ')34!M12 	 
 W455
'"9<O"OPP "J.a<<F9Q669Q66 !I-.*,- &(8D(AAB &(8qqq(AA	B 	  9,-$}45 %'74'@@A %'7aaa'@@	A 	 !I-.%56 &56 &(8D(AA	B
 &(8qqq(AAB 	 &23 :-. 	 8]M$B)TTTw 	4 	4A7#455L G$566M' #9M#II K
 kjI! ! ! 
 !22!33 	.11*2B2MNNOOOOOr   c           
          |                     d          }|                     d          dz
  }||f}|?t          |d d         d d d                   ddt          |d d                   z
  z  z   }nd }|d|d|d|	d|
di}|j        t          j        t          j        fv rt          j        d	nt          j	        d
 fd}t          ||||           d S )Nr   rt   r   r   r   )r   N)r   rt   )r   rl  TFc                 X    t          |          g	t          | R ddd d S )Nr   r   )rt  rN  r   r   )r  r_   )
rx   r   rt  rN  r  r  r>   is_beta_zeror  tile_ks
     r   r   z)_run_sampled_addmm_kernel.<locals>.kernel  sw    !$' 	
   &~6  $%     r   )r   r1   r@   r-   r   r.   r/   rv  rw  r  r   )r  r  r  r>   r  r  r   r   r   mat1mat2ra  r  r  rf   rg   rm   r   rt  rN  s   ``````            @@r   _run_sampled_addmm_kernelr    s6    KKNN	#((,,q0-	!TTrT 233gSRTSTRTEVEVAV6WWKKKI'')
 <EJ777
IJJ
IJ	 	 	 	 	 	 	 	 	 	 	 	 	foy+FFFFFr   g      ?)r  r  r\  r`  ra  r  r  c                l   d}t          ||            t          || ||          }	|st          ||| j                   t          ||| j                   |dk    r)| j        t
          j        u rt          d| d| d           | j        t
          j        ur-t          ||| j                   t          ||| j                   nt          |||j                   t          |||           |t          ||           t          |||j                   t          ||| j                   t          |j
        |	j
        k    o)|                                |                                 k    | d|	j
         d|	                                 d|j
         d	|                                 	           ||	                    |j        d
          }n|                    |	           |                                dk    s|                                dk    r|S |                                j
        dd          }
|                    d          }|                    d          }|                    d          }|dk    s|dk    r)|                                                    |           |S |}t%          |||          \  }}}}}t'          ||
d         |f          }t'          |||
d         f          }t)          |
 }t+          |||dk    |
||||||||           |                                                                dd          |                                dd          k    rQ|                                                    |                    |                                j
                             |S )Nsampled_addmmr  Fz(): having beta == z3 not equal to 0.0 with boolean mask is not allowed.z!(): Expects `out` to be of shape z and with nnz equal to z but got out.shape = z and out.nnz = T)copyr   r"   rt   r   r   )r   r   r   r   r-   r   boolr   r3   r+   r$   rW  r  rY  rR  r   r   r}  r   r   r}   r  rD   r[  )rb  r  r  r  r  r\  r`  ra  r   input_broadcastedr>   r'   r*   r  r  r   r   r   r  s                      r   r  r    s    !'''4VUD$OO 	u|444u|444s{{u{ej88kk$kkk   {%*,,FD%+666FD%+6666FD$*555&vtT::: ---VS$+666FC555I!2!88WSXXZZ5::<<=W R R@Q@W R R->-C-C-E-ER R+.9R REHXXZZR R   ;#&&tz&==CCII'(((99;;!sxxzzQJJJLL&rss+	IIbMMIIbMMIIbMM C<<166JJLLd###J 
8FsDRV8W8W5k64 	!a'899 9Q<'899i!CK	
 	
 	
$ %%'',0DDD%%fnnZ5F5F5H5H5N&O&OPPPr   )r\  r`  ra  r   c                   d}| j         dd          \  }}|st          ||            t          || |j                   t	          || |j        t          j        f           t          || |           |	                    d          }	| 
                                j         dd          \  }
}t          ||
|f           t          |	dz   | d|	 d           n|j         dd          \  }}	t          || |          }|x|sv|||	fz   }t          |j         |k    d| d|j          d	           t          |                                p'|                    dd                                          d
           ||                    |||	fz             }|                                 dk    r|                                S t'          || |dd|          S )Nbsr_dense_mmr"   rt   r;   z(): dense.size(-1) == z should be divisible by 16z9bsr_dense_mm(): `out` argument has wrong shape, expected z
, but got r!   zbsr_dense_mm(): only row-major/col-major `out` arguments are supported, i.e. (out.is_contiguous() or out.transpose(-2, -1).is_contiguous()) should be True.r   r   )r  r  r\  )r$   r   r   r   r3   r-   r   rf  r+   r   r   rA   r   rQ   is_contiguousr   r|  rW  r   r   )r   rc  r\  r`  ra  r   r   r'   r(   r*   	row_blockr  r)   rh  expected_out_shapes                  r   r  r  v  s     	"##2 	%VS)))el333U[5:-@@@&vsE:::

2A#&::<<#5bcc#: IyFY	$:;;;F
NNNNN   
 K$EB*>vsE*R*R'?;?!@Aq6!I	//G.G G:=)G G G  
 !!##Ls}}R'<'<'J'J'L'L"   ;//"AQF"JKKC 88::??99;; sCaaSIIIIr   MAX_ROW_NNZTILEc                    t          j        d          }t          j        d          }t          j        d          }| ||z  z   ||z  z   }t          j        |          }t          j        ||z             }||z
  }|dk    rd S t          j        d|
          }|||z  k     }|||z  z   ||z  z   ||z  z   }t          j        ||z   |t	          d                                         t           j                  }t          j        |d          }t          |
|	|
          D ]}||
z  }|||z  k     }t          j        ||z   |t	          d                                         t           j                  }t          j        |d          }t          j	        ||k    ||          }t          j
        ||z
            }t          j        |d          }t          |
|	|
          D ]}||
z  }|||z  k     }t          j        ||z   |t	          d                                         t           j                  }t          j
        ||z
            }|t          j        |d          z  }t          j        ||z   ||z                      |j        j                  |           t          |
|	|
          D ]}||
z  }|||z  k     }t          j        ||z   |t	          d                                         t           j                  }t          j
        ||z
            }t          j        ||z   ||z                      |j        j                  |           d S )Nr   r  r   r   infr  )r  )rv  r  r  r8  r0   r  rw  r}   rc   whereexpsumr  r-   r  )r  r  r  r  r  r  values_nnz_col_block_strider  r  r  r  r  row_block_offset_pidr  r  r  r  r  
row_aranger  curr_row_values_ptrsrow_tilemax_row_valuer  curr_max_row_valuenumdenoms                              r   _bsr_softmax_kernelr    s    Mq)))	!}!4441--- ')34!M12 	 
 W455
'"9<O"OPP "J.a<<FYq$''
Gi// !I-.%(<<= 9$% 	 7 :-Du
 
 

"RZ.. 	 xa000t[$// 		 		A$J) 33Dw$z1U5\\M  bnn  "$q!9!9!9H 22MCU MM
 fX-..s###t[$// 	) 	)A$J) 33Dw$z1U5\\M  bnn  &M122CRVCa((((EE 	 :-5[Z-899	
 	
 	
 	

 t[$// 	 	A$J) 33Dw$z1U5\\M  bnn  &M122CH$z1u  !1!<==    	 	r   c                    d}t          ||            t          || | j                   |                                 dk    s|                                 dk    r|                                 S | j        dd          \  }}|                                 }|                                 j        dd          \  t          j	        |          nt          j	                  | 
                                                    d                              dd          }|                                                     dd                                          r'|                                                                 }n|                                 }|                    dd                                                              d                              dd                              d|z            }|j        d         |z  f}d }	|dd df         d|d	i}
fd
}t#          ||
||	            |                    d|                              dd          j        |                                 j         }t%          j        | 
                                                                |                                                                 || j        | j                  S )Nbsr_softmaxr   r"   r   rl  rt   .rk  rj  c                 f    t          |          g t          | t          d          R   d S )Ni   )r  r_   rC   )rx   r   r  max_row_nnzr  s     r   r   zbsr_softmax.<locals>.kernel3  sf    % %~6  	 E;''     r   r   )r   r3   r-   rW  rR  cloner$   r   tritonnext_power_of_2r   r   r   r   r  rE   r[  r   r   r   r   r   )rb  r  r   r'   r*   nnzr   r   rf   rg   rm   r   r  r  s    `          @@r   r  r    s   '''FE5;///::<<1 2 2;;== {2331jjll$||~~3BCC8	9 033KK 0==K))++55a88@@BGG <<>>##B++99;; 	$\\^^))++FF\\^^FR$$Z\\Yq\\WQ^^WRC)O44 	 \!_ii@	 crc"MO	
	 	 	 	 	 	 	 	foy+FFFFNN2y#y99Yr2ellnn*, 	 -  &&((%%''<
 
 
 	
r   r  queryr  value	attn_mask	dropout_p	is_causalscalec           	          d}t          | | d           t          |d u| d           |J t          |j        t          j        k    | dt          j         d|j         d           t	          ||| j                   t	          ||| j                   t	          ||| j                   t          ||| j                   t          ||| j                   |j        t          j        urt          ||| j                   t          || |
                    dd          d	d
          }||                     d          dk    s|d	k    rt          d
| d| d           |*dt          j        |                     d                    z  n|}	|                                                    |	           t!          |          }t          j        j                            |                                |d           t)          ||          }|S )N_scaled_dot_product_attentionz'(): is_causal == True is not supported.z'(): attn_mask == None is not supported.z(): attn_mask.layout must be z, but got attn_mask.layout == r!   r"   rt   r  F)r  r`  r   z(): current value of scale == z results in division by zero.r   T)r   inplace)r   r   r   r   r   r   r3   r-   r  r  r   r   mathsqrtr   r}  r  nn
functionaldropoutr  )
r  r  r  r  r	  r
  r  r   sdpascale_factors
             r   r  r  M  s:    1)mOOOPPPit#%W%W%WXXX$$$ 00 7 7(-(87 7#,#37 7 7	
 	
 	
 	VS%,///VUEL111VY555FC---FE5;///?%*,,	5;777ucmmB33#5
 
 
 =UZZ^^q00ESLL / / / / /  
 9>q49UZZ^^44445<(((4  ##DKKMMY#MMMD%((r   r   r   r   r{  r   r   c                 4   | |z  }||z  }t          j        d          }t          j        d          }||z  }||z  }||z  t          j        d|          z   }||z  t          j        d|          z   }t          j        d|          } ||d d d f         |z  | d d d f         |z  z   z   }!|| d d d f         |	z  |d d d f         |
z  z   z   }"t          j        |||z  z             }#t          j        ||dz   |z  z             }$|#|$k    rd S t          j        ||f|          }%t          |#|$          D ]}&t          j        ||&|z  z             }'t          j        ||&|z  z   |z             }(t          j        |!|'|z  z             })t          j        |"|(|z  z             }*|%t          j        |)|*||          z  }%|||z  z   |d d d f         |z  |d d d f         |z  z   z   }+t          j        |+|%                    |j	        j
                             d S Nr   r  r   r  )r  rN  )rv  r  r8  r  r   rc   r  r  r  r-   r  ),r   r   r   
blocks_ptrblocks_stride_Pblocks_stride_Mblocks_stride_K
others_ptrothers_stride_Qothers_stride_Kothers_stride_Naccumulators_ptraccumulators_stride_Raccumulators_stride_Maccumulators_stride_Npq_offsets_ptrpq_offsets_stridepq_ptrpq_stride_Tpq_stride_1r{  r   r   rN  r   r   pid_tpidpid_mpid_nrmrnrkA_ptrB_ptrr   r   r  r   r   r   Ar   C_ptrs,                                               r   _scatter_mm2_kernelr4  {  st   6 &[&[1%%%m###r	bV^bi6222V^bi6222Yq!__qqq$wK/)BtQQQwK/,II
 qqq$wK/)BtQQQwK/,II
 W^e.?&??@@W^uqy4E&EEFF88FHff-]CCC	r2 	V 	VA[011A[0;>??AO 3344AO 3344A1*UUUUII ++, 111d733T111W+ 556 	 		%5%;%FGGHHHHHr   r   r   rK  
pq_indicesr   c                    | j         \  }}|j         \  }}|j         \  }	}}t          t          ddz            t          ddz            dd          }
fd}t          j        t
          j        t          j        t
          j        t          j        t
          j        t          j        t
          j        i|j	                 }d|
vr$|

                    |t
          j        k               t          |         || |                     d	          |                     d          |                     d          ||                    d	          |                    d          |                    d          ||                    d	          |                    d          |                    d                              d	          ||                    d	          |                    d          fd
|i|
 d S )Nr;   r   r   r   )r   r   r   r   c                     j         d         dz
  t          j        | d                   t          j        | d                   z  dfS )Nr   r   r   r   r$   r  cdiv)METAr   r   rK  s    r   rx   z_scatter_mm2.<locals>.grid  sH     #a'AtH~..QX1O1OO r   rN  rO  r   r{  )r$   r   r}   r   r   rv  rw  r/   r  r-   r   r4  rD   )r   r   rK  r5  r   r   r   r   r  r   r   rx   r{  r   r   s     `          @@r   r   r     s    ,1a,1a$1ar16??3r16??qTU
 
 
	 	 	 	 	 	 	 M2:NBJM2:M2:	

 
 t##KK=BJ#>K???D!MM!MM!MM!MM!MM!MM!""""""a  a  a  )	
 	
* (+	
, -	
 	
 	
 	
 	
r   r   rP  r   r   c                    ||z  }||z  }||z  }t          j        d          }t          j        d          }|| z  } || z  }!||z  }"||"z  }#|#|z  }$t          ||$z
  |          }%|$||%z  z   }&||"z  |%z  }'|&|z  t          j        d|          z   }(|'|z  t          j        d|          z   })t          j        d|          }*||(d d d f         |z  |*d d d f         |z  z   z   }+|| |	z  z   |*d d d f         |
z  |)d d d f         |z  z   z   },t          j        ||!z             }-|rU|-|z  |z  }.|-|z  |z  }/t          j        ||.z             }0t          j        ||.z   dz             }1|/|1z  ||/z
  |0z  z   }2|1|0z
  }3n6t          j        ||!z             }2t          j        ||!z   dz             }4|4|2z
  }3||2z   }5t          j        ||f|          }6|r|+|0|z  z  }+t          |3          D ]f}7t          j        |5          }8t          j        |,|8z             }9t          j        |+          }:|6t          j        |:|9||          z  }6|+|z  }+|5dz  }5gn||2z   };t          |3          D ]}7t          j        |5          }8t          j        |,|8z             }9t          j        |;          }<t          j        |+|<|z  z             }:|;dz  };|5dz  }5|6t          j        |:|9||          z  }6||-z   | |z  z   |(d d d f         |z  |)d d d f         |z  z   z   }=t          j        |=|6	                    |j
        j                             d S r  )rv  r  rC   r8  r  r   rc   r  r  r  r-   r  )>rA  r   r   r   r  r  r  r  r  others_stride_Br  r  r   accumulators_stride_Br"  r#  c_indices_ptrr_offsets_ptrp_offsets_ptrq_offsets_ptrrP  r{  r   r   r   r   rN  r   BLOCKS_MBLOCKS_Npid_t_r*  pid_br)  num_pid_in_groupgroup_idfirst_pid_mgroup_size_mr+  r,  r-  r.  r/  r0  r1  r   r'   r*   r   r   r   r  r   q_ptrr  r  r   r   r2  p_ptrr   r3  s>                                                                 r   _scatter_mm6_kernelrL    s	   < '\<<A&&&m###!("%0**+8k1:>>s\12''L8V^bi6222V^bi6222Yq"qqq$wK/)BtQQQwK/,II
 o%&!!!T'{_,r$'{_/LLN 	 GME)** 
	aBAQ2A*++B*Q.//BR7Q;",,Br'CC.//B.233Br'C"Hff-]CCC	 	R/))E3ZZ  GENNGEAI&&GENNRVqMj   	 (
 "B&E3ZZ 	 	GENNGEAI&&GENNGEA$7788

RVqMj   		
 ++, 111d733T111W+ 556	 	 		%5%;%FGGHHHHHr   Tr   r   r   r   force_contiguousc	                 @   |d         }	| j         \  }
}|j         \  }}|j         \  }}}||k    sJ ||	z  |k    sJ fd}t          j        t          j        t          j        t          j        t          j        t          j        t          j        t          j        i|j                 }d|vr$|                    |t          j        k               |	                    d          dk    sJ 	                    d          dk    sJ |	                    d          dk    sJ |	                    d          dk    sJ |rT| 
                                } |
                                }|                                s|
                                }n|}n|}t          |         ||| | 	                    d          | 	                    d          | 	                    d          ||	                    d          |	                    d          |	                    d          ||	                    d          |	                    d          |	                    d          |||fd|i| |r+|                                s|                    |           d S d S d S )	Nr   c                     j         d         z  t          j        | d                   t          j        | d                   z  fS )Nr   r   r   r8  )r:  r   r   r   r   s    r   rx   z_scatter_mm6.<locals>.grid  sD    "Q&BX//&+b$x.2Q2QQ r   rN  rO  r   r   r   r{  )r$   r   r   rv  rw  r/   r  r-   r   rD   rE   r  rL  rY  )r   r   r   r   r   r   r   r   rM  r   r   r   rT  r   B_r   r   rx   r{  accumulators_r   r   r   s      `                @@@r   r   r   l  s    y/L	2r<2q &	ArQwwww'\Qwwww	 	 	 	 	 	 	 	 M2:NBJM2:M2:	

 
 t##KK=BJ#>K???""a''''""a''''""a''''""a''''  	)&&((F&&((F--// - , 7 7 9 9 ,(MD!MM!MM!MM!MM!MM!MM!  ##  ##  ##)	
 	
* (+	
, -	
 	
 	
2  	.L$>$>$@$@ 	.}-----	. 	. 	. 	.r   rn  ro  rp  rr  c)                    t          j        d          })t          j        d          }*t          j        d          }+t          j        d          },t          j        d          }-t          j        |*|+|,|-|'          \  }*}+|||)z  z   ||*z  z   }.t          j        |.          }/t          j        |.|z             }0|0|/z
  }1t          j        d|"          }2t          j        d|$          }3t          j        d|#          }4| r4|||)z  z   ||*z  z   ||+z  z   ||2d d d f         z  z   ||4d d d f         z  z   }5| ||)z  z   ||/z  z   ||2d d d f         z  z   ||3d d d f         z  z   }6|||)z  z   ||+z  z   ||3d d d f         z  z   ||4d d d f         z  z   }7|||)z  z   ||*z  z   ||+z  z   ||2d d d f         z  z   ||4d d d f         z  z   }8||	|)z  z   |
|/z  z   }9| r6t          j        |5                              |%          }:|r|!s
||z  };|:|;z  }:nt          j        |"|#f|%          }:t          |1          D ]i}<t          j        |6          }=t          j        |9          }>t          j        |7||>z  z             }?|:t          j	        |=|?|&|%          z  }:|6|z  }6|9|
z  }9j|!s|:|z  }:t          j
        |8|:                    |j        j                             d S r  )rv  r  r  r  r  r8  r  r   rc   r  r  r-   r  )@r  r  r  r  r  r  r  r  r  r  r  	input_ptrinput_batch_strideinput_tiled_row_strideinput_tiled_col_strideinput_row_block_strideinput_col_block_strider  r  r  r  r  r  r  r  r  r  r  r  r  r  rn  ro  rp  rq  rs  rr  rt  rN  r  r   r  r  r  r  r  r  r  r  r  r  inner_block_aranger  
input_ptrsr  r  r  r  r  
beta_alphar  r  r  r  s@                                                                   r   ru  ru    s   p Mq)))	1---1---A...A...')|=,n(
 (
$}
 ')34!M12 	 
 W455
'"9<O"OPP "J.9Q66Yq/::9Q66 		 $y01(=89 )=89 )+;AAAtG+DD	E
 )+;D!!!G+DDE  !I-.*,- &(8D(AAB &(:47(CC	D 	  9,-$}45 %'9!!!T''BBC %'7aaa'@@	A 	 !I-.%56 &56 &(8D(AA	B
 &(8qqq(AAB 	 &23 :-. 	  	Y!wz2255i@@ /L /!E\
 J. !x(FiXXXw 	4 	4A7#455L G$566M' #9M#II K
 kjI! ! ! 
 !22!33 	&% 	.11*2B2MNNOOOOOr   r   )NNNNNN)NNNNNNr   )r   )NN)r  FN)T)Cr  osr  	functoolsr   typingr   r   r   torch._dynamo.utilsr   torch.utils._tritonr   _triton_ops_metar	   intgetenvr
   r   r   r   r+   r3   rA   rF   rQ   rV   r]   r_   ry   r   r   r   r   r   r   r   r  r  rL  rU  r^  Tensorr  r   re  r   r  triton.languagelanguagerv  jit	constexprr  r  r  r  r  r  r  r0   r  r4  r   rL  r   ru  r7   r   r   <module>ri     sd
    				        " " " " " " " "  ) ) ) ) ) ) * * * * * * & & & & & & .1SBI:A>>. . *
  
      "    (  &U U U      ; ; ;2& & & &0 7  7  7F	 	 	
/ 
/ 
/   >B m2 m2 m2 m2 m2l k k k kl 
G G G GTK K K K K K K K\ =>>>]
 ]
 ?>]
B  ;       FD D D DX 

"&MQ" " "<"	" <" 
%,	" " uXc]HSM8C=HIJ" 4." " " "T 

"&MQu u u<u	u <u 
%,	u u uXc]HSM8C=HIJu 4.u u u up :<< M%MMM      Z{4 l{4 |	{4
 |{4 {4> <?{4@ LA{4 {4 {4 Z{4z ZAPN |OAPP |QAPR <SAPT LUAPV WAP AP AP ZAPF3G 3G 3Gt &*!QUW W W|WlW lW el#W W 5#x}!LMNW W W Wz '+!QU#5J 5J 5J\5J|5J el#	5J
 5J 5#x}!LMN5J tn5J 5J 5J 5Jn ZV \V lV V V ZVpE
 E
 E
 E
X !%, ,|,\, |, EL)	,
 , , , , , ,\ ZEI<EI<EI <EI* |+EI, -EI. /EI0 L1EI EI EI ZEIN5
5
5
 L5
 L	5

 l5
 5
 5
 5
n ZoI LoI* |+oI, |-oI. /oI0 1oI2 3oI4 L5oI6 L7oI oI oI ZoIt "&X. X.X.X. <X. <	X.
 <X. <X. X. lX. X. X. X. X.t Z_PX \Y_PZ [_P\ l]_P^ |__P` |a_Pb c_Pd <e_Pf Lg_Ph i_Pj k_P _P _P Z_P _P _PD KLM$(!LL $r   