
    çg)                         d dl Z d dlZd dlZddlmZ ddlmZ ddlmZm	Z	m
Z
mZ  e j                    d             Zd Zd Zd	 Z	 ddZd ZdS )    N   )cdiv)driver)get_dram_gbpsget_max_simd_tflopsget_max_tensorcore_tflopsnvsmic                      	 t          dg          d         dz  S # t          $ rN dd l} |                                  |                     d          }|                     || j                  dz  cY S w xY w)Nzclocks.max.smr   g     @@)r	   FileNotFoundErrorpynvmlnvmlInitnvmlDeviceGetHandleByIndexnvmlDeviceGetMaxClockInfoNVML_CLOCK_SM)r   handles     X/var/www/html/ai-engine/env/lib/python3.11/site-packages/triton/ops/matmul_perf_model.pyget_clock_rate_in_khzr      s    To&''*S00 T T T22155//8LMMPSSSSSTs    AA32A3c                     |t          |d          z  }t          j        j                            |           d         dz  }t          ||          |z  t          |t                      |           z  }|S z# return compute throughput in TOPS    multiprocessor_count)minr   activeutilsget_device_propertiesr   r   devicenum_ctas	num_warpsdtypetotal_warpsnum_subcorestflopss          r   get_tensorcore_tflopsr$      ss    SA...K=&<<VDDE[\_``L{++l:=V$&&>0 >0 0FM    c                     |t          |d          z  }t          j        j                            |           d         dz  }t          ||          |z  t          |t                      |           z  }|S r   )r   r   r   r   r   r   r   r   s          r   get_simd_tflopsr'       sn    SA...K=&<<VDDE[\_``L{++l:=PQVXmXoXoqw=x=xxFMr%   c                     t           j                            |           }|d         dk     r"|t           j        k    rt	          | |||          S t          | |||          S )Nr      )torchcudaget_device_capabilityfloat32r'   r$   )r   r   r   r    
capabilitys        r   
get_tflopsr/   (   sZ    11&99J!}qUem33vxEBBB 9eDDDr%   Fc                 (   t           j                                        }|j        }|                                }t          ||          }t          ||	          }|}||z  |z  }t          ||          t          ||	          }}d|z  |z  |z  dz  }t          ||| |          }||z  }t          j	        j
                            |          d         }t          d||z            }t          d|dz            }t          t          d|dz
  dz            d          }t          |          |dz  |d	z  z   z  }|d
z  }||z  |z  dd|dz
  z  z   z  }||z  |z  dz  |dz
  z  }||z  |z  dd|dz
  z  z   z  } ||z  |z  dz  |dz
  z  }!|| z   dz  }"||!z   dz  }#|"|z  |#|z  z   }$|dz  }%||z  |z  |z  dz  }&|dk    r|&|%z  }'n|%}(|&|(z  }'||z  dz  dz  |%z  })|'|)z  }'t          ||$          |'z   }*|r"t          d|* d| d|$ d|' d|dz   d           |*S )zO return estimated running time in ms
          = max(compute, loading) + store r   i   @r          L   r   gffffff?g?r   g?g?i   g333333?zTotal time: zms, compute time: zms, loading time: zms, store time: zms, Activate CTAs: d   %)r*   r+   current_devicer    element_sizer   maxr/   r   r   r   r   r   r   print)+r   
num_stagesABCMNKBLOCK_MBLOCK_NBLOCK_KSPLIT_Kdebugkwargsr   r    dtsize	num_cta_m	num_cta_n	num_cta_kr   	total_opstput
compute_msnum_smactive_cta_ratioactive_cta_ratio_bw1active_cta_ratio_bw2dram_bwl2_bwload_a_dram	load_a_l2load_b_dram	load_b_l2
total_dramtotal_l2load_msstore_bwstore_c_dramstore_ms	reduce_bwzero_mstotal_time_mss+                                              r   estimate_matmul_timera   /   s    Z&&((FGE^^FQ  IQ  II9$y0H q'??C7OOqA A	A!34Ifh	599DT!J ] 66v>>?UVF1h/00q(R-00s1x"}&BCCQGGF##';d'BEY\`E`'`aGaKEa%&.Ay1}(=$=>KA$	A6Ia%&.Ay1}(=$=>KA$	A6I+<JI%+6H7"X%55G }Hq56>G+{;L!||(*	)+a%!){+h6G
G,,x7M 9 8] 8 8j 8 8&8 88@8 8 0 48 8 8 	9 	9 	9 r%   c                 ~   t           j                                        }t           j                                        }|d                                         }|d         j        }g }| D ]~}|j        }	|	d         |	d         |	d         |j        f\  }
}}}t          j	        j
                            |          d         }|
|z   |z  |z  |z  }||k    r|                    |           |} |t           j        t           j        fvrd | D             } i }| D ]k}|j        }	|	d         |	d         |	d         |	d         |j        |j        f\  }
}}}}}|
||||f}||v r||                             ||f           c||fg||<   lg }|                                D ]\  }}|\  }
}}}}|d         d	k    rc|
|z  |z  d
z  }|t#          d|          z  d	z  }d}||z  t%          j        d|fd          }|D ]}|                    |d                    ||d         d         }d|_        |                    |           |S )Nr;   rA   rB   rC   max_shared_memc                 4    g | ]}|j         d          dk    |S )rD   r1   )rF   ).0configs     r   
<listcomp>z&early_config_prune.<locals>.<listcomp>   s)    QQQf6=3Kq3P3P63P3P3Pr%   rD   r   r)   i   r   i,  r   c                 n    | d         z
  dk     rdt          | d         z
            z   n
| d         z
  S )Nr1   r   
   )abs)xoptimal_num_stagess    r   <lambda>z$early_config_prune.<locals>.<lambda>   sI    aD--22 %'QqT4F-F)G)G$G$G89!?Q8Q r%   )key)r*   r+   r6   r,   r7   r    rF   r:   r   r   r   r   appendfloat16r-   r   itemsr   heapq	nsmallest)configs
named_argsrF   r   r.   rG   r    pruned_configsrf   kwrA   rB   rC   r:   max_shared_memoryrequired_shared_memoryconfigs_maprD   r   rn   kvmmas
mma_cyclesldgsts_latencynearestnrandom_configrl   s                               @r   early_config_pruner   p   s   Z&&((F1133J_))++FsO!E N * *]yM2i="Y-9JJ 	.': #M/EEfMMN^_")G"3w!>!Kf!T!%666!!&)))G U]EM222QQQQQ K 	6 	6]yM2i="Y-IHXZ`Zkk 	B'7Iz '9=+##VZ$89999!' 45KN!!## 1 11895'7Ia=AW$w.+>DAy 1 11A5J N!/*!< o1 R R R RS S SG  , ,%%ad++++, aDGM'(M$!!-0000r%   )F)	functoolsrr   r*    r   runtimer   testingr   r   r   r	   	lru_cacher   r$   r'   r/   ra   r    r%   r   <module>r      s                      \ \ \ \ \ \ \ \ \ \ \ \ T T T    E E E > > > >B; ; ; ; ;r%   