
     Ngh                        d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZ	d dl
Z
d dlZd dlZd dlmZmZ d dlmZmZ d dlmZmZmZmZmZmZ d dlmZ d dlmZmZmZ d dl m!Z! d dl"m#Z#m$Z$m%Z% d dl&Z' ej(        e)          Z*d	 Z+d
e j,        de-fdZ.d
e j,        fdZ/d Z0d Z1d Z2d Z3d Z4d Z5ddZ6d Z7e)dk    r e7             dS dS )    N)measure_memorysetup_logger)get_rankget_size)add_io_bindings_as_ortvalues%get_merged_sample_with_past_kv_inputsget_msft_sample_inputsget_sample_inputsget_sample_with_past_kv_inputsverify_ort_inputs)ORTModelForCausalLM)ProfilerActivityprofilerecord_function)trange)
AutoConfigAutoModelForCausalLMAutoTokenizerc                     | j         dv rdS | j         dk    r>	 t          |j                  S # t          $ r t          |j        j                  cY S w xY wt          |                                          S )N   hf-pt-eagerhf-pt-compiler   hf-ort)benchmark_typeleninputs_names	Exceptiondecoderinput_names
get_inputs)argsmodels     k/var/www/html/ai-engine/env/lib/python3.11/site-packages/onnxruntime/transformers/models/llama/benchmark.pyget_ort_model_inputs_lenr$   (   s    >>>qh&&	2u)*** 	2 	2 	2u}011111	2 u!!"""s   , #AAr!   ort_model_inputs_lenc                    d\  }}| j         dk    rdn| j        j        }| j         dv rXt          | j        | j        | j        | j        d          }t          | j        | j        | j        | j        | j        d          }n| j         dv r|d	k    rXt          | j        | j        | j        | j        d          }t          | j        | j        | j        | j        | j        d          }nwt          | j        | j        | j        | j        d
|| j        | j
        dd
  
        }t          | j        | j        | j        d| j        || j        | j
        dd
  
        }n| j         dk    r{t          | j        | j        | j        | j        d
|| j        | j
        dd| j                  }t          | j        | j        | j        d| j        || j        | j
        dd| j                  }n| j         dk    rg|dk    }t          | j        | j        d
| j        || j        | j
        |          }t          | j        | j        | j        d|| j        | j
        |          }nt          d          ||fS )NNNort-msfti   r   T)return_dict)use_fp16r)   >   r      r   pt)seq_lenpast_seq_lenmax_seq_lenr*   use_buffer_shareenginer)      ort-convert-to-onnxort)r-   r.   r/   r*   r0   r1   r)   
world_size   )r.   r-   r/   r*   r0   split_kvz/Unable to auto-detect inputs for provided model)r   configmax_position_embeddingsr
   target_device
batch_sizesequence_lengthr   r*   r   r0   r5   r	   r   )r!   r%   init_inputsiter_inputsr/   r7   s         r#   r    r    5   s   )K
 -;;$$AdK>>>'KO 
 
 
 5KO ]
 
 
 
	
	*	*1$$+"$   K 9"$   KK @",'!%!6   K @"!1'!%!6   KK 
	 5	5	5;KO(#]!2
 
 
 <KO-#]!2
 
 
 
	
	*	*'!+,KO(#]!2	
 	
 	
 -KO-#]!2	
 	
 	
 IJJJ##    c                    d\  }}d\  }}| j         dv r| j        r| j        n| j        }t          j                    }t	          j        || j        rt          j        nt          j	        | j
        | j
        d| j                                      | j                  }t          j                    }| j         dk    rt          j        |          }nU| j         dv r5t          j                    }| j        |_        | j        rd|_        d|_        nt-          d| j                    | j         d	k    rt/          | j                  t2          u r| j        d
         n| j        }t/          | j                  t2          u r| j        d         nd }d }d }	t5          j        | j                  D ])}
d|
vsd|
v sd|
v rd|
v s|
dk    r|
}d|
v r|
}	d|
v r|
}|
}	*t          j                    }t;          j        | j        ||	| j
        | j
        d|dk    rdnd |||
  
        }t          j                    }| j         dv rt<                              d| j         !                    | j"                              t          j                    }t          j#        | j         !                    | j"                  || j        g          }t          j                    }t<                              d||z
   d           |S )Nr'   r   T)torch_dtypeuse_auth_tokentrust_remote_code	use_cache	cache_dirr   >   r   r(   r3   r2   Cannot recognize r   r   z.onnxz
.onnx_dataz
.onnx.datadecoder_modelz
model.onnxdecoder_with_past_modeldecoder_merged_model)	decoder_file_namedecoder_with_past_file_namerB   rC   use_io_binding
use_mergedproviderprovider_optionssession_options   r(   r3   zLoading model from )	providerszLoaded model in  s)$r   hf_pt_dir_path
model_nametimer   from_pretrainedr*   torchfloat16float32authrE   tor:   compiler4   SessionOptionsr   enable_profilingverboselog_verbosity_levellog_severity_levelr   typeexecution_providertupleoslistdirhf_ort_dir_pathr   loggerinfoort_model_pathformatrankInferenceSession)r!   r"   sess_options
start_timeend_timesourcerN   rO   rJ   rK   filenames              r#   	get_modelrt      s0   $E<%J >>>(,(;P$$Y[[
$4)-IEM9"in
 
 
 "T
 
  	 9;;/11M%((E		 M	M	M)++(,%< 	0/0L,./L+ AD,?AABBBh&&15d6M1N1NRW1W1W4*1--]a]t9=d>U9V9VZ_9_9_42155ei &*#
4#788 		7 		7Hh&&,(*B*BlV^F^F^(**h,.F.F$,!(H44.6+%11$,!.6+Y[[
#3 /(C9"i 1\ A At-(
 
 
 9;;AAAQ$*=*D*DTY*O*OQQRRRY[[
$&&ty11./
 
 

 9;;
KK<8j#8<<<===Lr?   c                      j         dv rt           j                  n t           j        t          j        d          } j        r% ||          }t                              |            fd} fd}|D ]!} |              ||            |             "d} j         dv rt           j	                  n t           j	        t          j        d          }	|	D ]O} |             t          j
                    }
 ||            |             t          j
                    }|||
z
  z  }P j         dvrt                              d           | j	        z  } j        |z  } j        dk    rt                              d	 j                    t                              d
 j                    t                              d| d           t                              d| d           d S )NrQ   zWarm up)filedescc                  h    j         dk    r"j        dv rj                                        nfdS )NcpurQ   c                      j         dk    r<t          j                                        rt          j                                        nd S )Nry   c                      d S N kwargss    r#   <lambda>z=time_fn.<locals>.<lambda>.<locals>.<lambda>.<locals>.<lambda>       r?   devicerX   cudais_availablesynchronizer   r!   s    r#   r   z+time_fn.<locals>.<lambda>.<locals>.<lambda>  D    {e##
(?(?(A(A# J""$$$%% r?   )r   r   
io_bindingsynchronize_inputsr   s    r#   r   ztime_fn.<locals>.<lambda>  sM    ;%D$7;^$^$^ 	**,,,
 
 
 
 r?   c                  h    j         dk    r"j        dv rj                                        nfdS )Nry   rQ   c                      j         dk    r<t          j                                        rt          j                                        nd S )Nry   c                      d S r|   r}   r~   s    r#   r   z=time_fn.<locals>.<lambda>.<locals>.<lambda>.<locals>.<lambda>(  r   r?   r   r   s    r#   r   z+time_fn.<locals>.<lambda>.<locals>.<lambda>%  r   r?   )r   r   r   synchronize_outputsr   s    r#   r   ztime_fn.<locals>.<lambda>"  sM    ;%D$7;^$^$^ 	++---
 
 
 
 r?   r   	Benchmark zBatch Size: zSequence Length: z	Latency: rS   zThroughput: z tps)r   rangewarmup_runsr   sysstdoutr`   ri   rj   num_runsrV   r;   rm   r<   )r!   fninputswarmup_rangeoutputs
input_syncoutput_sync_
total_timebench_rangerp   rq   latency
throughputs   `             r#   time_fnr     s/    "EEE 	dD$3:IFFF  | "V**G   J   K   

6


 J "EEE 	dmDM
EEE 
  	, 	,
Y[[

6


9;;h++

 "EEEB4=(G7*JyA~~44?44555>(<>>???++++,,,3:333444
Fr?   c                 N   d| j          d| j         d| j                                         d| j         d| j         d|j                            dd           d| dt          j        	                                d}d }| j        dv rt          t          j        t          j        gdd          5 }t          d	          5   ||           d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   |                    d
                              | j        | j                  }t&          j                            | j        | d          }t/          |d          5 }|                    |           d d d            n# 1 swxY w Y   n ||           | d}|S )Nb_sr   -z%Y-%m-%d_%H:%M:%Sr   T)
activitiesrecord_shapesprofile_memorymodel_inferencer6   )group_by_stack_n)sort_by	row_limitz.logwz.json)r;   r<   r   lower	precisionr   __name__replacedatetimenowr   r   CPUCUDAr   key_averagestablept_filter_bypt_num_rowsrf   pathjoin
log_folderopenwrite)	r!   r   r   inputs_typeprefixrs   prof	prof_datafs	            r#   
profile_fnr   R  s    R  R  RD$8  R  R4;N;T;T;V;V  R  RY]Yg  R  Rjnju  R  Rxz  yD  yL  yL  MP  RU  yV  yV  R  R  Yd  R  R  go  gx  g|  g|  g~  g~  R  R  RFH>>>(,.>.CDTXim
 
 
 	 !233  6


              	 	 	 	 	 	 	 	 	 	 	 	 	 	 	
 %%q%99??HYeieu?vv	7<<FAA(C   	AGGI	 	 	 	 	 	 	 	 	 	 	 	 	 	 	
 	6


 ###OsH   0C/ CC/C	C/C	 C//C36C3'F		FFc                    t          j                    }t          j        |          }|                    d                       | j        dk    rHt                              d|                    d           t          j        d          z   d           t          j
                     t          j                                         t          | j        dk    fd	
           t           j                                         d S )Ng?)intervalr   zCPU usage: F)logical%ry   c                                  S r|   r}   )r   r   s   r#   r   zmeasure_fn.<locals>.<lambda>|  s    rr&zz r?   )is_gpufunc)rf   getpidpsutilProcesscpu_percentrm   ri   rj   	cpu_countgccollectrX   r   empty_cacher   r   r   r   flush)r!   r   r   pidprocesss    ``  r#   
measure_fnr   o  s    
)++CnS!!G%%%BvJJJyA~~i'"5"5t"5"D"DvGW`eGfGfGf"fiiijjj JLLL	J4;%/7I7I7I7I7IJJJJ Jr?   c                    fd}|}| j         dk    r ||            ||           | j        r(t          | ||d          }| j         dk    rvj        j                                        }t                              d| d|            t          j	        |t          j
                            | j        |                     t          | ||d          }| j         dk    rvj        j                                        }t                              d| d|            t          j	        |t          j
                            | j        |                     d S t                              d           t          | ||           t!          | ||           t                              d	           t          | ||           t!          | ||           d S )
Nc                      di | }|S )Nr}   r}   r   r   r"   s     r#   
get_logitsz$run_hf_inference.<locals>.get_logits  s    %//&//r?   r   promptr   	Renaming  to token7
Evaluating `model(inputs)` step to get past_key_values5
Evaluating `model(inputs)` step with past_key_values)r   r   r   r   sessionend_profilingri   warningrf   renamer   r   r   decoder_with_pastrj   r   r   )r!   r=   r>   r"   r   generate_fnnew_lognameold_lognames      `    r#   run_hf_inferencer     s       4 Ko--K   K   |  {KJJ(**-/==??KNNE{EEEEFFFIk27<<#M#MNNN {KII(**19GGIIKNNE{EEEEFFFIk27<<#M#MNNN KKJKKKD+{+++t[+...
KKHIIID+{+++t[+.....r?   c                      fd}fd}fd} j         dk    r|n|}i } j        r+ |||          \  }	}t           ||	d          }
                                }t                              d| d|
            t          j        |t          j        	                     j
        |
                     t                      |||          \  }}t           ||d          }
                                }t                              d| d|
            t          j        |t          j        	                     j
        |
                     d S t                              d	            |||          \  }	}t           ||	           t           ||	           t                              d
            |||          \  }}t           ||           t           ||           d S )Nc                     t          |           } j        dk    rHt          | j        t          j                  j        |          \  }}t          d|           ||fS | |fS )Nry   r   )r   r   r   intrm   r0   setattr)r   kv_cache_ortvaluesr   r!   r"   s      r#   prepare_ort_inputsz-run_ort_inference.<locals>.prepare_ort_inputs  s}    "5&11 ;%-Ivt{C	NND<QSe. .*J* D,
333111)))r?   c                 2                         |            d S r|   )run_with_iobinding)r   r"   s    r#   with_io_bindingz*run_ort_inference.<locals>.with_io_binding  s      ,,,,,r?   c                 4                         d |           }|S r|   )runr   s     r#   without_io_bindingz-run_ort_inference.<locals>.without_io_binding  s    ))D&))r?   ry   r   r   r   r   r   r   )r   r   r   r   ri   r   rf   r   r   r   r   rt   rj   r   r   )r!   r=   r>   r"   r   r   r   r   r   ort_init_inputsr   r   ort_iter_inputss   `  `         r#   run_ort_inferencer     sB   * * * * * *- - - - -    
 &*[E%9%9//?QK| .@.@N`.a.a++ {OXNN ))++A;AAKAABBB
	+rw||DO[IIJJJ $.@.@N`.a.a++ {OWMM ))++A;AAKAABBB
	+rw||DO[IIJJJ KKJKKK*<*<[J\*]*]'O'D+///t[/222
KKHIII*<*<[J\*]*]'O'D+///t[/22222r?   c                     | j         dv rt          | |||           d S | j         dv rt          | |||           d S t          d| j                    )N>   r   r   r   rQ   rF   )r   r   r   r   )r!   r=   r>   r"   s       r#   run_inferencer     sn    HHH{K?????		 C	C	C$[%@@@@@AD,?AABBBr?   c           	         t          j                    }|                    ddt          dg d           |                    ddt          dd	           |                    d
dddd           |                    dddt          dg dd           |                    dt          dd           |                    dt          dd           |                    dt          dd           |                    ddd !           |                    d"d#d$!           |                    d%d&t          t          j                                        rd'nd(g d)*           |                    d+d,t          d-.           |                    d/d0t          d1.           |                    d2d3t          d4.           |                    d5t          d6.           |                    d7t          d8.           |                    d9t          d:.           |                    d;dd<           |                    d=t          d>d?           |                    d@t          dAdB           |                    dCdd<           |                    dDt          t          j	        
                    dE          dF           |                    dGt          ddHdIJ           |                                }t          j                            |j                   t	          j        |j                   dK|j        v rit#          |dL|j                                         dM           |j        dNk    r|j        dO| if|_        n"|j        dPk    r|j        dO| if|_        d'|_        |j        dQk    r|j        s
J dR            |j        dSv r|j        s
J dT            |j                            dU          |_        |j                            dU          |_        |j        dVv s|j        dWk    r|j        d(k    rdndX|_        |j        r:t9          |j                  d:k    rt9          |j                  d:k    s
J dY            |S )ZNz-btz--benchmark-typeT)r   r   r   r(   r3   )rc   requiredchoicesz-mz--model-namez<Hugging Face name of model (e.g. 'meta-llama/Llama-2-7b-hf'))rc   r   helpz-az--authF
store_truez5Use Hugging Face authentication token to access model)defaultactionr   z-pz--precisionfp32)int4int8fp16r  zePrecision for model. For ONNX models, the model's precision should be set before running this script.)r   rc   r  r   r   z--hf-pt-dir-pathr   zNPath to directory containing all PyTorch files (e.g. tokenizer, PyTorch model))rc   r  r   z--hf-ort-dir-pathzhPath to directory containing all ONNX files (e.g. tokenizer, decoder_merged, decoder, decoder_with_past)z--ort-model-pathzPath to ONNX modelz-bz--batch-sizesz1 2)r  z-sz--sequence-lengthsz32 64 128 256 512z-dz--devicer   ry   )ry   r   rocm)rc   r  r   z-idz--device-idr   )rc   r  z-wz--warmup-runsr6   z-nz
--num-runs
   z--seed   z--max-length    z--num-return-sequencesr2   z	--profile)r  r  z--pt-filter-byself_cpu_time_totalz"What to filter PyTorch profiler byz--pt-num-rowsi  z.Number of rows for PyTorch profiler to displayz	--verbosez--log-folder.zFolder to cache log filesz--cache-dirz./model_cachez-Cache dir where Hugging Face files are stored)rc   r   r  r   r4   rd   ExecutionProviderCUDAExecutionProvider	device_idROCMExecutionProviderr   z,Please specify a path to `--hf-ort-dir-path`rQ   z+Please specify a path to `--ort-model-path` >   r  r  r  r  zOPlease provide only one (batch_size, sequence_length) combination for profiling)argparseArgumentParseradd_argumentstrrX   r   r   r   rf   r   r   
parse_argsnprandomseedmanual_seedr   r   r   upperrd   rh   rk   batch_sizessplitsequence_lengthsr   r   r   )rm   parserr!   s      r#   get_argsr     sQ   $&&F

 
 
     K     hlAx    
 000t     ]	     w	     !	         
 #    
 *1133>'''     }3BBB
oCCCC
lbAAA
sA666 S"===
0sAFFF U<HHH
s,AHl     c4Fvwww
U<HHH
S"',,s:K:KRmnnn
<     D INN49	di    ###*t{/@/@/B/B,U,U,UVVV"&==='+'>d@S&TD##$(???'+'>d@S&TD# DK h&&#SS%SSSSAAA"QQ$QQQQ'--c22D 177<<D .$44469Q9QVZVaejVjVjqw 	N
 | ] !!Q&&3t/D+E+E+J+J+J\ ,K+J+J Kr?   c                     t                      } t                      }t          |           }t          |j                   t
                              |j                   dt          j	        j
        _        | |_        ||_        t          j        |j        |j        |j        |j                  }t'          j        |j        |j        |j        |j                  }|j        dk    r
d|j         n|j        }|j        dk    }t-          |d|           t-          |d|           t-          |d|           t-          |d	|           t/          |          }t1          ||          }|j        d
v rt5          j        |j                            |j                  d          }	t=          t?          d |	j         j!                            }
|otE          |
          dk    o
|j        dk    }t-          |d|           nt-          |dd           tG          j$        |j%        |j&                  D ]\  }}|j        dk    r!t
                              d| d| d           t-          |dtO          |                     t-          |dtO          |                     tQ          ||          \  }}tS          ||||           d S )NT)rE   rB   rC   ry   zcuda:r  	tokenizerr8   r:   r*   rQ   F)load_external_datac                     | j         dk    S )NGroupQueryAttention)op_type)nodes    r#   r   zmain.<locals>.<lambda>  s    T\=R-R r?   r   r0   z
Batch size = z and sequence length = z...r;   r<   )*r   r   r   r   r`   ri   rj   __dict__rX   backendscudnn	benchmarkrm   r5   r   rW   rU   rE   r[   r   r   r   r   rt   r$   r   onnx
load_modelrk   rl   listfiltergraphr'  r   	itertoolsproductr  r  r   r    r   )rm   r5   r!   r"  r8   r:   r*   r"   r%   
onnx_model	gqa_nodesr0   r;   r<   r=   r>   s                   r#   mainr5    s   ::DJD>>D
KK%)EN"DI DO-4>$)_c_h  I '4>$)_c_h  F ,0;%+?+?'DI'''T[M~'HD+y)))D(F###D/=111D*h''' dOOE3D%@@ AAA_T%8%?%?	%J%J_deee
 R RT^TdTijjkk	#SI(:St{e?S(*:;;;;(%000 (1'89I4K`'a'a = =#
O9>>KKa*aa_aaabbblC
OO444'_)=)=>>>#-d4H#I#I [dKe<<<<= =r?   __main__)r   )8r  r   r   r1  loggingrf   r   rV   numpyr  r,  r   rX   benchmark_helperr   r   dist_settingsr   r   llama_inputsr   r   r	   r
   r   r   optimum.onnxruntimer   torch.profilerr   r   r   tqdmr   transformersr   r   r   onnxruntimer4   	getLoggerr   ri   r$   	Namespacer   r    rt   r   r   r   r   r   r   r   r5  r}   r?   r#   <module>rC     s     				      				 



         9 9 9 9 9 9 9 9 , , , , , , , ,                4 3 3 3 3 3 E E E E E E E E E E       H H H H H H H H H H    		8	$	$
# 
# 
#$X' $s $ $ $ $DRH& R R R RjC C CL  :  &;/ ;/ ;/|83 83 83vC C CE E E EP/= /= /=d zDFFFFF r?   