
     Ng                        d Z ddlZddlZddlZddlZddlmZ ddlZddlZddlm	Z	m
Z
mZmZmZmZmZmZmZmZmZ ddlmZ ddlmZmZ ddlmZmZmZmZ ddlmZ dd	l m!Z!  ej"        d
          Z# ej$        d          Z$dej%        vr e&e$          ej%        d<   ddl'Z'ddl(m)Z)m*Z*m+Z+ d Z,d Z-de.de.fdZ/d Z0d Z1d Z2e3dk    r e2             dS dS )a]   Benchmarking the inference of pretrained transformer models.
    PyTorch/TorchScript benchmark is based on https://github.com/huggingface/transformers/blob/master/examples/benchmarks.py.
    One difference is that random input_ids is generated in this benchmark.

    For onnxruntime, this script will convert a pretrained model to ONNX, and optimize it when -o parameter is used.

    Example commands:
        Export all models to ONNX, optimize and validate them:
            python benchmark.py -b 0 -o -v -i 1 2 3
        Run OnnxRuntime on GPU for all models:
            python benchmark.py -g
        Run OnnxRuntime on GPU for all models with fp32 optimization:
            python benchmark.py -g -o
        Run OnnxRuntime on GPU with fp16 optimization:
            python benchmark.py -g -o -p "fp16"
        Run TorchScript on GPU for all models:
            python benchmark.py -e torchscript -g
        Run TorchScript on GPU for all models with fp16:
            python benchmark.py -e torchscript -g -p "fp16"
        Run ONNXRuntime and TorchScript on CPU for all models with quantization:
            python benchmark.py -e torchscript onnxruntime -p "int8" -o
        Run OnnxRuntime with the ROCM provider and graph optimization script:
            python benchmark.py -g -m bert-base-cased --provider rocm --optimizer_info by_script --disable_embed_layer_norm
        Run OnnxRuntime with bfloat16 fastmath mode kernels on aarch64 platforms with bfloat16 support:
            python benchmark.py --enable_arm64_bfloat16_fastmath_mlas_gemm

    It is recommended to use run_benchmark.sh to launch benchmark.
    N)datetime)ConfigModifierOptimizerInfo	Precisioncreate_onnxruntime_sessionget_latency_resultinference_ortinference_ort_with_io_bindingoutput_detailsoutput_fusion_statisticsoutput_summarysetup_logger)FusionOptions)MODEL_CLASSESMODELS)create_onnxruntime_inputexport_onnx_model_from_ptexport_onnx_model_from_tfload_pretrained_model)version)QuantizeHelper F)logicalOMP_NUM_THREADS)
AutoConfigAutoTokenizerLxmertConfigc                 	   dd l }g }| r^d|                                vrHd|                                vr2d|                                vrt                              d           |S d}|dk    r@t          j        }d}d|                                vrt                              d	           |S |t          j        k    rt                              d
| d           |D ]}t          |         d         }|
D ]}|t          |          k    r n|d |         }t          |         d         |_	        t          j        |          }d|v rt          j                    5  t          |t          |         d         t          |         d         t          |         d         |||||| |||||||          \  }} }!}"d d d            n# 1 swxY w Y   d|v rWt          |t          |         d         t          |         d         t          |         d         |||||| |||||||          \  }} }!}"| s4t!          || |d|||          }#|#Nd |#                                D             }$g }%| rdnd}&t%          j        ||          }'t)          j        t-          |          t-          |          t-          |!|'j                  g          }(t)          j        t-          |          |'j        g          })|D ]}*|*dk    r
|D ]}+|"|+|"k    rd|v rt(          j        nt(          j        },t5          |!|*|+||'|,          }-d|j        ||&||| ||||*|+|                                t;          t=          j                              d}.|'j	        dv r/t                               d| d|*d|'j!        |'j!        g            n"t                               d| d|*|+g            |rtE          |#|-|.|	|*|          }/n|##                    |$|-          }0|(g}1tI          t          |0                    D ]J}2|2dk    r-t          |         d         dk    r|1%                    |)           5|1%                    |(           Kd|v rt(          j&        nt(          j'        }3tQ          |#|-|.|	|$|0|%|1|*|&|3|          }/t                               |/           |%                    |/           ې|S )Nr   CUDAExecutionProviderROCMExecutionProviderDmlExecutionProviderzPlease install onnxruntime-gpu or onnxruntime-directml package instead of onnxruntime, and use a machine with GPU for testing gpu performance.tensorrt   TensorrtExecutionProviderzhPlease install onnxruntime-gpu-tensorrt package, and use a machine with GPU for testing gpu performance.zOptimizerInfo is set to zA, graph optimizations specified in FusionOptions are not applied.   pt      tfT)enable_all_optimizationnum_threadsverbose(enable_mlas_gemm_fastmath_arm64_bfloat16c                     g | ]	}|j         
S  )name).0node_args     ^/var/www/html/ai-engine/env/lib/python3.11/site-packages/onnxruntime/transformers/benchmark.py
<listcomp>z#run_onnxruntime.<locals>.<listcomp>   s    XXX(XXX    cudacpu	cache_dironnxruntimeenginer   	providersdevice	optimizer	precision
io_binding
model_nameinputsthreads
batch_sizesequence_lengthcustom_layer_numr   vitswinzRun onnxruntime on  with input shape gpt))r:   get_available_providersloggererrorr   NOOPTwarningr   len
model_typer   parsetorchno_gradr   r   r   get_outputsr   from_pretrainednumpyprodmaxhidden_sizeint64int32r   __version__get_layer_numstrr   nowinfo
image_sizer	   runrangeappendlonglongintcr
   )4use_gpuprovidermodel_namesmodel_classconfig_modifierr@   r+   batch_sizessequence_lengthsrepeat_timesinput_countsoptimizer_infovalidate_onnxr9   onnx_dirr,   	overwritedisable_ort_io_bindinguse_raw_attention_maskmodel_fusion_statisticsmodel_source(enable_arm64_bfloat16_fastmath_mlas_gemmargsr:   resultswarm_up_repeatrB   all_input_names
num_inputsinput_namesfusion_optionsonnx_model_fileis_valid_onnx_model
vocab_sizemax_sequence_lengthort_sessionort_output_namesoutput_buffersr>   configmax_last_state_sizemax_pooler_sizerE   rF   input_value_type
ort_inputsresult_templateresultort_outputsoutput_buffer_max_sizesi	data_types4                                                       r3   run_onnxruntimer   Y   ss   2 G	$K,O,O,Q,QQQ$K,O,O,Q,QQQ#;+N+N+P+PPP ]	
 	
 	
 N:&,&k.Q.Q.S.SSSLLz   N,,,x~xxx	
 	
 	
 " e+ e+
 ,Q/& c	+ c	+JC0000)+:+6K$Z03DO*066N|##]__   2"z*1-z*1-z*1-#'! #!&%.!/&# '+"+              2 |## .:&q):&q):&q)#"!*+"# #''* ' 4(,'9a  K "XXk>U>U>W>WXXXN&1VVEF/
iPPPF"'*$$())
F$677# # $j#k*:*:F<N)OPPO) L+ L+
??'7 I+ I+O*6?M`;`;` 6:l6J6Ju{{PUP[$!9""'#(" "J #0#.#:%-"(%3%.*@&@&0",#.&0+:,;,I,I,K,K$'$7$7' 'O" (O;; H*  H  HQ[]^`f`qsy  tE  QF  H  H    $w*$w$wYcetXu$w$wxxx- "!.'&+(&*" " '2oo6F
&S&S3F2G/!&s;'7'7!8!8 T TA Avv&*<Q*?5*H*H 7 > > O O O O 7 > >?R S S S S6:l6J6JENNPUPZ	!>'&+(,'*3&"%*" " KK'''NN6****SI+L+\ Ns   AGGGc                    g }| r:t           j                                        st                              d           |S t          j        d           |D ]}t          j        ||	|          }|                    |           t          ||||          }|j
        dv r
|d         g}n1t          j        ||          }|j                            |d          }t                              d	|            t                              d
|                                            |t           j        k    r|                                 t          j        | rdnd          }|                    |           |t           j        k    rt-          j        |          }|D ]F}|dk    r
|D ]8}|j
        dv rzt                              d| d|d|j        |j        g            t          j        |d|j        |j        f|t           j        k    rt           j        nt           j        |          nX|||k    rt                              d| d||g            t          j        d|j        dz
  ||ft           j        |          	 |	r t           j         !                    |          n|
rt          j"        |          n|            tG          j$        fd|d          }|	rdn|
rdndt           j%        d| rdndd|d|d||||&                                tO          tQ          j)                              d}|*                    tW          ||                     t                              |           |,                    |           # tZ          $ rC}t          .                    |           t           j        /                                 Y d }~2d }~ww xY wH|S )NzYPlease install PyTorch with Cuda, and use a machine with GPU for testing gpu performance.F)torchscriptr9   )r   r9   custom_model_classrH   r   r8      zModel zNumber of parameters zcuda:0r7   zRun PyTorch on rK   r%   )sizedtyper>   r'   )lowhighr   r   r>   c                                  S Nr/   )	inference	input_idss   r3   <lambda>zrun_pytorch.<locals>.<lambda>  s    YYy5I5I r5   repeatnumberr   torch2rU   NAr6   r   r;   )0rU   r6   is_availablerN   rO   set_grad_enabledr   rX   modifyr   rS   r   max_model_input_sizesgetdebugnum_parametersr   FLOAT16halfr>   toINT8r   quantize_torch_modelrc   rd   randnfloat16float32randintr   longjittracecompiletimeitr   r_   r`   ra   r   rb   updater   rg   RuntimeError	exceptionempty_cache)rj   rl   rm   rn   r@   r+   ro   rp   rq   r   r   r9   r,   r}   rB   r   model	tokenizermax_input_sizer>   rE   rF   runtimesr   er   r   s                            @@r3   run_pytorchr   9  se    G uz..00 pqqq	5!!!! U- U-
+JK[deeev&&&%*	
 
 
 // 0 34%5jIVVVI&<@@TRRN%e%%&&&EU-A-A-C-CEEFFF	)))JJLLL'<hhu==	&&"7>>E% 7	- 7	-JQ#3 3- 3-$77KK*UVX^Xikqk|H}   !&(!V->@QR/8I<M/M/MemmSXS`%! ! !II &1o6V6V KK o* o oQ[]lPm o oppp %#.2(/:#j%! ! !I-=Hw	y999flNwem\aNbNbNbrw  Ii(((%}-I-I-I-I-IR^ghiiiH 4?"c--PVDcHH\c#(#4%),3">&&%'%.&(&0"##.&0+:,;,I,I,K,K$'$7$7 F  MM"4Xz"J"JKKKKK'''NN6****# - - -$$Q'''J**,,,,,,,,-c3-	7	-r Ns   D	N
O#	 8O	O#	do_eager_modeuse_xlac                 2     ddl m dd l fd}|S )Nr   )wrapsc                                   fd            }                                            fd                        }du rdu s
J d            |S |S )Nc                       | i |S r   r/   r|   kwargsfuncs     r3   run_in_eager_modezFrun_with_tf_optimizations.<locals>.run_func.<locals>.run_in_eager_mode  s    4((((r5   )experimental_compilec                       | i |S r   r/   r   s     r3   run_in_graph_modezFrun_with_tf_optimizations.<locals>.run_func.<locals>.run_in_graph_mode  s     4((((r5   TFzcCannot run model in XLA, if `args.eager_mode` is set to `True`. Please set `args.eager_mode=False`.)function)r   r   r   r   r)   r   r   s   `  r3   run_funcz+run_with_tf_optimizations.<locals>.run_func  s    	t	) 	) 	) 	) 
	) 
t	'	2	2	) 	) 	) 	) 
3	2 
	) D  5   t !  $$$$r5   )	functoolsr   
tensorflow)r   r   r   r)   r   s   `` @@r3   run_with_tf_optimizationsr     sS    % % % % % % % %$ Or5   c                    ! g }dd l !!j        j                            |           | s!j                            g d           | r5!j                                        st                              d           |S | r!j        	                    d          }	 !j                            |d         d           !j        j
                            |d         d           !j                            d           n1# t          $ r$}t                              |           Y d }~nd }~ww xY w|t           j        k    s|t           j        k    rt'          d          |D ]}t)          j        ||	          |                               t/          ||	|d	          t1          j        ||	          }|j                            |d
          }|D ]/}|dk    r
|D ]!}|||k    rt                              d| d||g            dd l}|                                  fdt=          ||z            D             }!                    |||f!j                   	 tC          dd          fd            }tC          dd          fd            }tC          dd          !fd            }|j"        r|ntG          tH                    r|              tK          j&        fd|d          }d!j'        d| rdndd|d|d||||(                                tS          tU          j+                              d}|,                    t[          ||                     t                              |           |.                    |           # t          $ rS}t                              |           ddl/m0} |1                                }|2                                 Y d }~d }~ww xY w1|S )Nr   GPUzVPlease install Tensorflow-gpu, and use a machine with GPU for testing gpu performance.Tz/gpu:0)r>   z+Mixed precision is currently not supported.r8   )r   r9   r   is_tf_modelr   zRun Tensorflow on rK   c                 L    g | ] }                     d j        dz
            !S )r   r'   )r   r   )r1   r   r   rngs     r3   r4   z"run_tensorflow.<locals>.<listcomp>  s/    mmmA#++a):Q)>??mmmr5   )shaper   F)r   r   c                        d          S )NF)trainingr/   r   r   s   r3   encoder_forwardz'run_tensorflow.<locals>.encoder_forward  s    $uY????r5   c                          d          S )NF)decoder_input_idsr   r/   r   s   r3   encoder_decoder_forwardz/run_tensorflow.<locals>.encoder_decoder_forward  s    $uY)V[\\\\r5   c                      j                             ddj        g          } j                             ddj        g          } | |d          S )Nr'   F)visual_feats
visual_posr   )randomnormalvisual_feat_dimvisual_pos_dim)featsposr   r   r   r)   s     r3   lxmert_forwardz&run_tensorflow.<locals>.lxmert_forward  sf     "	 0 0!Q8N1O P P i..1f6K/LMM$u%).'*%*	      r5   c                                    S r   r/   )r   s   r3   r   z run_tensorflow.<locals>.<lambda>&  s    YY[[ r5   r'   r   r   r   r6   r7   r   r;   )r6   )3r   r   	threading set_intra_op_parallelism_threadsset_visible_devicestestis_built_with_cudarN   rO   list_physical_devicesexperimentalset_memory_growth
distributeOneDeviceStrategyr   r   r   r   r   NotImplementedErrorr   rX   r   r   r   r   r   rc   r   Randomrf   constantr^   r   is_encoder_decoder
isinstancer   r   r   r_   r`   ra   r   rb   r   r   rg   numbar6   get_current_devicereset)"rj   rl   rm   rn   r@   r+   ro   rp   rq   r9   r,   r}   physical_devicesr   rB   r   r   rE   rF   r   valuesr   r   r   r   r   r6   r>   r   r   r   r   r   r)   s"                               @@@@@@r3   run_tensorflowr    s    GI88EEE 1
	%%b%000 rw1133 mnnn  9::5AA	 I))*:1*=uEEEI"445Ea5H$OOOM++8+<<<< 	  	  	 Q	  I%%%in)D)D!"OPPP! W# W#
+J)LLLv&&&%*
 
 
 "1*	RRR	"8<<ZNN% G	# G	#JQ#3 C# C#!-/N2R2RnnnPZ\kOlnnooommoommmmmz\kOkIlIlmmmKKz?6S[][cKdd	7#.UERRR@ @ @ @ @ SR@ /UERRR] ] ] ] ] SR] /UERRR       SR !0I0 3$;		#FL99 3$2	IKKK%}-@-@-@-@^_```H #/#%>%),3">&&%'%.&(&0"##.&0+:,;,I,I,K,K$'$7$7 F  MM"4Xz"J"JKKKKK'''NN6****# # # #$$Q'''******!4466FLLNNNNNNNN#}C#	G	#R Ns3   A"D   
D.
D))D.(D<N&&
P	0AO>	>P	c                  N   t          j                    } |                     ddddt          g dt	          t          j                              dd                    t          j                              z              |                     d	dd
t          dddgd           |                     ddt          d t	          t                    dd                    t                    z              |                     ddddt          dgg dd           |                     dddt          t          j
                            dd          d           |                     ddt          t          j
                            dd          d           |                     dd dd!d"#           |                     d$dt          d d%           |                     d&d't          t          j        t	          t                    d()           |                     d*dd!d+#           |                     d,dd!d-#           |                     d.d/t          t          j        t	          t                    d0)           |                     d1d2dd!d3#           |                     d4d5dd d67           |                     d8d9dd d:7           |                     d;d<dd d=7           |                     d>d?ddd
gt          g d@dAB           |                     dCdDddEt          dFG           |                     dHdIdt          d
gJ           |                     dKdLdt          g dMJ           |                     dNdd!dO#           |                     dP           |                     dQdRddt          dSgdTU           |                     dVdt          d dW           |                     dXdd!dY#           |                     dZ           t#          j        |            |                                 }|S )[Nz-mz--modelsF+)zbert-base-casedzroberta-basegpt2z Pre-trained models in the list: z, )requirednargstypedefaultchoiceshelpz--model_sourcer'   r&   r)   zExport onnx from pt or tfz--model_classz!Model type selected in the list: )r	  r  r  r  r  z-ez	--enginesr:   )r:   rU   r   r   r   zEngines to benchmarkz-cz--cache_dir.cache_modelsz%Directory to cache pre-trained models)r	  r  r  r  z
--onnx_dironnx_modelszDirectory to store onnx modelsz-gz	--use_gpu
store_truezRun on gpu device)r	  actionr  z
--providerzExecution provider to usez-pz--precisionzfPrecision of model to run. fp32 for full precision, fp16 for half precision, and int8 for quantization)r  r  r  r  z	--verbosezPrint more informationz--overwritezOverwrite existing modelsz-oz--optimizer_infozjOptimizer info: Use optimizer.py to optimize onnx model as default. Can also choose from by_ort and no_optz-vz--validate_onnxzValidate ONNX modelz-fz--fusion_csvz:CSV file for saving summary results of graph optimization.)r	  r  r  z-dz--detail_csvz#CSV file for saving detail results.z-rz--result_csvz$CSV file for saving summary results.z-iz--input_counts)r'   r(   r%   zXNumber of ONNX model inputs. Please use 1 for fair comparison with Torch or TorchScript.)r	  r
  r  r  r  r  z-tz--test_timesd   z8Number of repeat times to get average inference latency.)r	  r  r  r  z-bz--batch_sizes)r
  r  r  z-sz--sequence_lengths)             @         z--disable_ort_io_bindingz=Disable running ONNX Runtime with binded inputs and outputs. )rw   z-nz--num_threadsr   zThreads to use)r	  r
  r  r  r  z--force_num_layersz%Manually set the model's layer numberz*--enable_arm64_bfloat16_fastmath_mlas_gemmzHEnable bfloat16 mlas gemm kernels on aarch64. Supported only for CPU EP )r{   )argparseArgumentParseradd_argumentra   listr   keysjoinr   ospathr   FLOAT32r   BYSCRIPTintset_defaultsr   add_arguments
parse_args)parserr|   s     r3   parse_argumentsr+  E  sI   $&&F
;;;V[]]##/$))FKMM2J2JJ  	 	 	 t(     ]##0499]3K3KK     OOO#  	 	 	 S.114     S-00-     kE,Uhiii
(     !Yu     eLOghhh
(	     &]##y     "     I     2     3     		g  	 	 	 G     oSsQCPPP
,,,     "L	     u555
     4     4W	     GGG'''DKr5   c                  
   t                      } t          | j                   | j        t          j        k    r#| j        st                              d           d S | j        t          j	        k    r,| j        r%| j
        dvrt                              d           d S t          | j                  dk    r(t          | j        d                  d         dv rdg| _        t          d	 | j        D                       | _        t                              d
|             t$          j                            | j                  sK	 t%          j        | j                   n0# t.          $ r# t                              d| j                   Y nw xY wd| j        v }d| j        v }d| j        v }d| j        v }d| j        v }|r]t3          j        t6          j                  t3          j        d          k     r)t                              dt6          j                    d S t;          | j                  }g }| j        D ]h}t7          j        |           t                               t6          j!        "                                           |s|s|r| j#        dgk    rt          $                    d           |rK|tK          | j        | j        | j&        || j        || j'        | j        | j(        dd| j        | j                  z  }|rK|tK          | j        | j        | j&        || j        || j'        | j        | j(        dd| j        | j                  z  }|rK|tK          | j        | j        | j&        || j        || j'        | j        | j(        dd| j        | j                  z  }|rI|tS          | j        | j        | j&        || j        || j'        | j        | j(        | j        | j                  z  }i }	|r	 | j*         }
|tW          | j        | j
        | j        | j&        || j        || j'        | j        | j(        | j#        | j,        | j-        | j        | j.        | j        | j/        | j0        |
|	| j1        | j2        |           z  }=# tf          $ r t          4                    d           Y dw xY wjtk          j6                    7                    d          }|	r| j8        pd| d}ts          |	|           t          |          dk    r(| j'        dgk    rt          $                    d           d S | j:        pd| d}tw          ||           | j<        pd| d}t{          |||            d S )Nzfp16 is for GPU only)migraphxrocmzint8 is for CPU onlyr'   r   r%   )rI   swimr   c                 ,    h | ]}|d k    rt           n|S )r   )	cpu_count)r1   xs     r3   	<setcomp>zmain.<locals>.<setcomp>  s$    TTTaAFFyyTTTr5   zArguments: z#Creation of the directory %s failedrU   r   r   r:   r   z2.0.0z2PyTorch version must be >=2.0.0 and you are using zB--input_counts is not implemented for torch or torchscript engine.TF	Exceptionz%Y%m%d-%H%M%Sbenchmark_fusion_z.csvzNo any result available.benchmark_detail_benchmark_summary_)>r+  r   r,   r@   r   r   rj   rN   rO   r   rk   rR   modelsr   rp   sortedr+   rc   r"  r#  existsr9   mkdirOSErrorenginesr   rT   rU   r_   r   force_num_layersset_num_threadsr   
__config__parallel_inforr   rQ   r   rm   ro   
test_timesr  use_mask_indexr   rs   rt   ru   rv   rw   rz   r{   r4  r   r   rb   strftime
fusion_csvr   
detail_csvr   
result_csvr   )r|   enable_torchenable_torch2enable_torchscriptenable_onnxruntimeenable_tensorflowrn   r}   r+   ry   rx   
time_stampcsv_filenames                r3   mainrO    s   D~***4<*+,,,~''DL'T]Rf=f=f+,,,
4;1A!7!:o!M!M!#TT4CSTTTUUD
KK$d$$%%%7>>$.)) P	PHT^$$$$ 	P 	P 	PLL>OOOOO	P dl*L,M&$,6&$,6$4 u'899GM'<R<RRR]%J[]]^^^$T%:;;OG' g. g.k***U%3355666 5	= 5	,> 5	 QC''cddd! ;LK$#N$)ONL     ;LK$#N$)ONL     ;LK$#N$)ONL     	~  %  G #% 	..-1-@)@&?LMK$#N$)O%'&NMLN/*+%A/  2  . . .  -----.9	.> ((99J HN*Nj*N*N*N !8,GGG
7||qs""NN5666?J&J*&J&J&JL7L)))?K&K:&K&K&KL7L$/////s%   E *FF%B
Q11$RR__main__)4__doc__r  loggingr"  r   r   rY   psutilbenchmark_helperr   r   r   r   r   r	   r
   r   r   r   r   r   r   huggingface_modelsr   r   onnx_exporterr   r   r   r   	packagingr   quantize_helperr   	getLoggerrN   r1  environra   rU   transformersr   r   r   r   r   boolr   r  r+  rO  __name__r/   r5   r3   <module>r^     s    :   				                                   ) ( ( ( ( ( 4 4 4 4 4 4 4 4                  * * * * * *		2		FU+++	 BJ&&$'C	NNBJ !  @ @ @ @ @ @ @ @ @ @] ] ]@m m m`T D    4  DE E EP_0 _0 _0D zDFFFFF r5   