
     Ng                       d Z ddlZddlZddlZddlZddlZddlmZ ddlm	Z	 ddl
mZmZmZmZmZ ddlZddlZddlZddlmZmZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+ ddl,m-Z. ddl/m0Z0 ddl1m2Z3 ddl4m5Z5m6Z6  ej7        d          Z8 G d de          Z9d^deee:                  dej;        fdZ<dej;        fdZ=dej;        fdZ>d_de:de?fdZ@d_de:de?de?fdZAde:de?d e?de)fd!ZBd"ej        d#efd$ZCd"ej        d#efd%ZDd"ej        d#efd&ZE	 	 	 	 d`d)ed*ed+e:d,eFd-eeG         d.eeG         fd/ZHd0ed1efd2ZI	 dad"ed,eFdee         fd3ZJd4 ZKd5 ZLd6 ZMd7efd8ZNd7ed9e?d:e?de?fd;ZOd7efd<ZP	 dbd?ed@e:dAeFdBeFdCeFf
dDZQd7efdEZRd7efdFZSdGefdHZTd_dIe:de?fdJZU	 d_dIe:dKe:de?de?fdLZVdM ZWe9jX        fdej;        dNe9fdOZYdej;        d?ee e%f         dPejZ        dQejZ        dReFdSeFdTeeeF                  dee:ef         fdUZ[dV Z\dcdej;        dXeee:                  dYe?fdZZ]d^dej;        dXeee:                  fd[Z^dddeee:                  dXeee:                  fd\Z-e_d]k    r e-             dS dS )ea  
This converts GPT2 or T5 model to onnx with beam search operator.

Example 1: convert gpt2 model with beam search:
    python convert_generation.py -m gpt2 --output gpt2_beam_search.onnx

Example 2: convert gpt2 model with beam search containing specific cuda optimizations:
    python convert_generation.py -m gpt2 --output gpt2_beam_search.onnx --use_gpu                       --past_present_share_buffer --use_decoder_masked_attention

Example 3: convert gpt2 model with beam search with mixed precision and enable SkipLayerNorm strict mode:
    python convert_generation.py -m gpt2 --output gpt2_beam_search.onnx --use_gpu -p fp16 --use_sln_strict_mode

Example 4: convert T5 model with beam search in two steps:
    cd ./models/t5
    python convert_to_onnx.py -m t5-small
    cd ../..
    python convert_generation.py -m t5-small --model_type t5                                            --decoder_onnx ./models/t5/onnx_models/t5-small_decoder.onnx                                    --encoder_decoder_init_onnx ./models/t5/onnx_models/t5-small_encoder_decoder_init.onnx          --output ./models/t5/onnx_models/t5_small_beam_search.onnx

Example 5: convert T5 model with beam search. All in one step:
    python convert_generation.py -m t5-small --model_type t5 --output ./models/t5/onnx_models/t5_small_beam_search.onnx

Example 6: convert T5 model with beam search containing specific cuda optimizations. All in one step:
    python convert_generation.py -m t5-small --model_type t5 --output ./models/t5/onnx_models/t5_small_beam_search.onnx           --use_gpu --past_present_share_buffer --use_decoder_masked_attention

Example 7: convert MT5 model with external data file like mt5-base-beamsearch.onnx.data in below example.
    python convert_generation.py -m google/mt5-base --model_type mt5 --output mt5-base-beamsearch.onnx -e

Example 8: convert gpt2 model with greedy search:
    python convert_generation.py -m gpt2 --output gpt2_greedy_search.onnx --num_beams 1 --num_return_sequences 1

Example 9: convert gpt2 model with sampling:
    python convert_generation.py -m gpt2 --output gpt2_sampling.onnx --num_beams 1 --num_return_sequences 1 --top_p 0.6
    N)Enum)Path)AnyDictListOptionalUnion)	Precisionsetup_logger)NumpyHelper)
GraphProto
ModelProtoTensorProto)	OnnxModel)
GPT2ConfigGPT2LMHeadModelGPT2Tokenizer	MT5ConfigMT5ForConditionalGenerationT5ConfigT5ForConditionalGenerationT5Tokenizer)GraphOptimizationLevelInferenceSessionSessionOptionsget_available_providers)main)PRETRAINED_GPT2_MODELS)export_onnx_models)PRETRAINED_MT5_MODELSPRETRAINED_T5_MODELS c                        e Zd ZdZdZdZd ZdS )GenerationTypebeam_searchgreedy_searchsamplingc                     | j         S N)value)selfs    g/var/www/html/ai-engine/env/lib/python3.11/site-packages/onnxruntime/transformers/convert_generation.py__str__zGenerationType.__str__U   s
    z    N)__name__
__module____qualname__
BEAMSEARCHGREEDYSEARCHSAMPLINGr-    r.   r,   r$   r$   P   s2        J"LH    r.   r$   argvreturnc                 r   t          j                    }|                    d          }|                    dddt          dd                    t          t          z   t          z             z              |                    dd	t          d
g ddd                    g d          z              |                    dd	t          t          j
                            dd          d           |                    dd	t          dd           |                    dd	t          dd           |                    dd	dd           |                    d	           |                    d          }|                    ddt          d           |                    d d!d	t          t          j        t          j        t          j        gd"           |                    d#d$d	d%d&gd'(           |                    d)d*d	dd+           |                    d	,           |                    d-d.d	dd/           |                    d	0           |                    d1d2d	dd3           |                    d	4           |                    d5d6d	dd7           |                    d	8           |                    d9d:d	dd;           |                    d	<           |                    d=          }|                    d>d	dd?           |                    d	@           |                    dAd	ddB           |                    d	C           |                    dDd	dE           |                    d	F           |                    dGt          d	dHdIJ           |                    dKd	ddL           |                    d	M           |                    dNd	ddO           |                    d	P           |                    dQd	ddR           |                    d	S           |                    dTd	ddU           |                    d	V           |                    dWd	ddX           |                    d	Y           |                    dZd	dd[           |                    d	\           |                    d]d	dd^           |                    d	_           |                    d`          }|                    dat          d	dbdcJ           |                    ddt          d	dedfJ           |                    dgt          d	dhdiJ           |                    djt          d	dbdkJ           |                    dlt           d	dbdmJ           |                    dnt           d	dbdoJ           |                    dpt           d	dqdrJ           |                    dst           d	dqdtJ           |                    dut           d	t!          dv           dwJ           |                    dxt          d	dbdyJ           |                    dzt           d	d{d|J           |                    d}t          d	dHd~J           |                    dt          d	ddJ           |                    dt          d	ddJ           |                    dt          d	ddJ           |                    d          }|                    dd	dd           |                    d	           |                    dd	dd           |                    d	           |                    dd	dd           |                    d	           |                    dd	dd           |                    d	           |                    dd	dd           |                    d	           |                    dd	t          dbd           |                    dd	dd           |                    d	           |                    |           }|S )zParse arguments

    Args:
        argv (Optional[List[str]], optional): _description_. Defaults to None.

    Returns:
        argparse.Namespace: Parsed arguments.
    zInput optionsz-m--model_name_or_pathTzEPytorch model checkpoint path, or pretrained model name in the list: , )requiredtypehelpz--model_typeFgpt2)r>   t5mt5z*Model type (default is gpt2) in the list: )r;   r<   defaultchoicesr=   --cache_dir.cache_modelsz%Directory to cache pre-trained models)r;   r<   rA   r=   z--decoder_onnxr"   zLPath of onnx model for decoder. Specify it when you have exported the model.z--encoder_decoder_init_onnxzgPath of ONNX model for encoder and decoder initialization. Specify it when you have exported the model.z	--verbose
store_truezPrint more information)r;   actionr=   )verbosezOutput options--outputz,Output path for onnx model with beam search.z-p--precisionzTPrecision of model to run. fp32 for full precision, fp16 for half or mixed precisionz-b--op_block_list*autozDisable certain onnx operators when exporting model to onnx format. When using defaultvalue for gpt2 type of model fp16 precision, it will be set to ["Add", "LayerNormalization", "SkipLayerNormalization", "FastGelu"]. Other situation, it will be set to [])r;   nargsrA   r=   z-e--use_external_data_formatz!save external data for model > 2G)use_external_data_formatz-sz--run_shape_inferencezrun shape inference)run_shape_inferencez-dpvsz--disable_pad_vocab_sizezDo not pad logits MatMul weight to be a multiple of 8 along the dimension where dim value is the vocab size. The logits MatMul may hence be of poor performance for fp16 precision.)disable_pad_vocab_sizez-dsgdz,--disable_separate_gpt2_decoder_for_init_runzDo not create separate decoder subgraphs for initial and remaining runs. This does not allow for optimizations based on sequence lengths in each subgraph)*disable_separate_gpt2_decoder_for_init_runz-iz--disable_shared_initializerszdo not share initializers in encoder and decoder for T5 or in the init decoder and decoder for GPT2. It will increase memory usage of t5/mt5/gpt2 models.)disable_shared_initializersz6Beam search parameters that stored in the output modelz--output_sequences_scoreszoutput sequences scores)output_sequences_scoresz--output_token_scoreszoutput token scores)output_token_scoresz--early_stopping)r;   rG   )early_stoppingz--no_repeat_ngram_sizer   zNo repeat ngram size)r<   r;   rA   r=   z--vocab_maskz\Enable vocab_mask. This mask applies only to every generated token to filter some bad words.)
vocab_maskz--past_present_share_bufferzWUse shared buffer for past and present, currently work for gpt2 greedy/sampling search.)past_present_share_bufferz--use_decoder_masked_attentionzUses `DecoderMaskedSelfAttention` or `DecoderMaskedMultiHeadAttention` to optimize the decoding Attention computation. Must be used with `past_present_share_buffer`. Currently, only Attention head sizes of 32, 64 and 128 are supported.)use_decoder_masked_attentionz--prefix_vocab_maskzeEnable prefix_vocab_mask. This mask can be used to filter bad words in the first generated token only)prefix_vocab_maskz--custom_attention_maskz]Enable custom_attention_mask. This mask can be used to replace default encoder attention mask)custom_attention_maskz--presence_maskz!Presence mask for custom sampling)presence_maskz--seedzRandom seed for sampling op)seedzYBeam search parameters not stored in the output model, for testing parity and performancez--min_length   zMin sequence lengthz--max_length2   zMax sequence lengthz--num_beams   z	Beam sizez--num_return_sequencesz&Number of return sequence <= num_beamsz--length_penaltyz<Positive. >1 to penalize and <1 to encourage short sentence.z--repetition_penaltyz-Positive. >1 to penalize and <1 to encourage.z--temperature      ?z6The value used to module the next token probabilities.z--top_pzTop P for samplingz--filter_valueInfzFilter value for Top P samplingz--min_tokens_to_keepzAMinimum number of tokens we keep per batch example in the output.z--presence_penalty        z%presence penalty for custom sampling.z--customz&If 1 customized top P logic is appliedz--vocab_sizezIVocab_size of the underlying model used to decide the shape of vocab maskz--eos_token_idzKcustom eos_token_id for generating model with existing onnx encoder/decoderz--pad_token_idzKcustom pad_token_id for generating model with existing onnx encoder/decoderz0Other options for testing parity and performancez--use_sln_strict_modez_Enable strict mode for SLN in CUDA provider. This ensures a better accuracy but will be slower.)use_sln_strict_mode	--use_gpuz)use GPU for inference. Required for fp16.)use_gpuz--disable_parityzdo not run parity test)disable_parityz--disable_perf_testzdo not run perf test)disable_perf_testz--torch_performanceztest PyTorch performance)torch_performancez--total_runsz4Number of times of inference for latency measurementz--save_test_dataz-save test data for onnxruntime_perf_test tool)save_test_data)argparseArgumentParseradd_argument_groupadd_argumentstrjoinr   r!   r    ospathset_defaultsr
   FLOAT32FLOAT16intfloat
parse_args)r6   parserinput_groupoutput_groupmodel_groupbeam_parameters_group
test_groupargss           r,   parse_argumentsr   Y   s    $&&F++O<<KT
))*-AADYY
Z
Z[     %%%9DIIF[F[F[<\<\\     S.114     [     %v     %	     &&&,,-=>>L;	     !"I$56c     X  	 	 	 $0     u===%lQf     %888"b     U;;;6G     OOO'E     %@@@++,deeK#&	     U;;;"	     777/%UUUE222 #     k	     ...%f	     u===(	     %@@@t	     u555!l	     59990	     5111*	     %((("55c  &&~C%YZav&www&&~C%Y[bw&xxx&&}3XY`k&lll&& 5 '    &&K '    &&< '    &&E '    &&! '    &&u. '    &&P '    &&4 '    &&5 '    &&X '    &&Z '    &&Z '    **+]^^Jn	     666eL?j     E***%	     5111#	     e444'	     e444C     <	     5111T""DKr.   r   c                 Z   | j         }d|d| j        dd| j        t          j        k    rdndddd	d
dg}| j        r|                    d| j        g           | j        r|                    d           | j	        r|                    d           t          | j                  r0|                    dg           |                    | j                   | j        t          j        k    r| j        s
J d            | j        rt                              d|            t!          |           dS )zqConvert GPT-2 model to onnx

    Args:
        args (argparse.Namespace): arguments parsed from command line
    r9   rI   z--optimize_onnxrJ   fp32fp16z--test_runs1z--test_cases10z--overwriterC   rg   rO   rK   zEfp16 or mixed precision model cannot run in CPU. Please add --use_gpuzarguments for convert_to_onnx:)r6   N)model_name_or_pathdecoder_onnx	precisionr
   rv   	cache_dirextendrh   appendrP   lenop_block_listrw   rH   loggerinfoconvert_gpt2_to_onnx)r   
model_name	argumentss      r,   gpt2_to_onnxr     s\    (J 	.I$5556I ~ :-8999| &%%%$ 75666
4 -+,---+,,,~***|dddddd
 | B@Y@@AAAi((((((r.   c                    t          | j        | j        t          | j                  j        | j        | j        | j        t          j
        k    | j        dddddd| j                  }t                              d|d                     t                              d|d                     |d         | _        |d         | _        dS )	znConvert T5 model to onnx

    Args:
        args (argparse.Namespace): arguments parsed from command line
    FT)rh   rP   optimize_onnxr   rH   use_decoder_start_tokenmerge_encoder_and_decoder_init	overwritedisable_auto_mixed_precisionuse_int32_inputs
model_typezonnx model for encoder: r   zonnx model for decoder: r_   N)export_t5_onnx_modelsr   r   r   outputparentrh   rP   r   r
   rw   r   r   debugencoder_decoder_init_onnxr   )r   pathss     r,   
t5_to_onnxr     s     "T[ !%!>~)::. %'+%*?  E" LL6E!H66777
LL6E!H66777%*1XD"aDr.   T	onnx_pathrP   c                     ddl m} t          j        | d          }|                    |dd          }|rt          j        || |           d	S t                              d           d	S )
zShape inference on an onnx file, which will be overwritten.

    Args:
        onnx_path (str): Path of onnx model
        use_external_data_format(bool): output tensors to external data or not.
    r   )SymbolicShapeInferenceTload_external_dataF)
auto_mergeguess_output_ranksave_as_external_dataz4Failed to run symbolic shape inference on the model.N)	&onnxruntime.tools.symbolic_shape_inferr   onnx
load_modelinfer_shapesr   saver   warning)r   rP   r   modelouts        r,   shape_inferencer   (  s     NMMMMMOI$???E
 
-
-eX]
-
^
^C
 OsI=UVVVVVVMNNNNNr.   c                 ~   t          j        | d          }|j        j        d         j        }t          |          }|                                }||v sJ ||         }|j        dk    rdS d}|                    |j	        d                   }|A|
                    |dd          }	|	dS |                    |	j	        d                   }|dS d}|j        t          j        j        k    rdS t          |j                  d	k    rdS |j        d         }
|
d
z  dk    rdS t#          j        |
d
z            d
z  }||
z
  }|j        r|rbt)          j        |j        d         |ft(          j                  }t)          j        t1          j        |          |fd          }||j        d<   nat)          j        ||j        d         ft(          j                  }t)          j        t1          j        |          |fd          }||j        d<   |                                |_        ndS t          j        || |           dS )zPad the logits MatMul weight in the provided decoder model, which will be overwritten.

    Args:
        onnx_path (str): Path of onnx model
        use_external_data_format(bool): output tensors to external data or not.
    Tr   r   MatMulFr_   N	Transpose      dtypeaxisr   )r   r   graphr   namer   output_name_to_nodeop_typeget_initializerinputmatch_parent	data_typer   DataTyperw   r   dimsmathceilraw_datanpzerosfloat16concatenater   to_arraytobytesr   )r   rP   decoder_model_protologits_output_namedecoder_modelr   matmul_nodepad_along_axis_1logits_weighttranspose_before_matmulactual_vocab_sizepadded_vocab_sizepaddingpadding_dataweight_with_paddings                  r,   pad_weights_of_logits_matmulr   :  s}    /)MMM,29!<A122M';;==!44444%&89Kh&&u
 !11+2CA2FGGM"/"<"<[+WX"Y"Y"*5%556M6STU6VWW 5  +"6">>>u =!##u &*1-A!##t	"3a"7881<"33G   	68]%7%:G$DBJWWWL"$.+2F}2U2UWc1dkl"m"m"m$5Mq!!8Wm.@.C$DBJWWWL"$.+2F}2U2UWc1dkl"m"m"m$5Mq!!4!<!<!>!>u N&	Iabbbb4r.   
model_pathrh   rf   c                 "   t                      }t          j        |_        |rddgndg}|rQdt	                      vrt          d          t                              d           |rddi}d|ifd|D             }t          | ||          }|S )	a  Create OnnxRuntime session.

    Args:
        model_path (str): onnx model path
        use_gpu (bool): use GPU or not
        use_sln_strict_mode (bool): use strict mode for skip layer normalization or not

    Raises:
        RuntimeError: CUDAExecutionProvider is not available when --use_gpu is specified.

    Returns:
        onnxruntime.InferenceSession: The created session.
    CUDAExecutionProviderCPUExecutionProviderz5CUDAExecutionProvider is not available for --use_gpu!zuse CUDAExecutionProvider"enable_skip_layer_norm_strict_modeTc                 0    g | ]}|v r
||         fn|S r5   r5   ).0r   provider_optionss     r,   
<listcomp>z&create_ort_session.<locals>.<listcomp>  sB     # # #Y]$:J2J2J'-..PT# # #r.   )	providers)	r   r   ORT_DISABLE_ALLgraph_optimization_levelr   RuntimeErrorr   r   r   )r   rh   rf   sess_optionsexecution_providerscuda_provider_optionsort_sessionr   s          @r,   create_ort_sessionr     s     "##L,B,RL)OVt24JKK]s\t 
"*A*C*CCCVWWWKK3444 	%I4$P! 79NO# # # #at# # # #:|GZ[[[Kr.   r   r   c           
      j   |t           j        k    }t          | j                  }|dz
  }|dk    sJ g dd t	          |          D             z   }t          | j                  t          |          k    r4t          dt          |           dt          | j                             t          |          D ]\  }}| j        |         j        |k    r(t          d| d| d| j        |         j                   t          j	        }|dk    r|rt          j        nt          j
        }| j        |         j        j        j        }	|	|k    rt          d| d	| d|	           t                              d
           dgd t	          |          D             z   }
t          | j                  t          |
          k    r4t          dt          |
           dt          | j                             t          |
          D ]\  }}| j        |         j        |k    r(t          d| d| d| j        |         j                   |rt          j        nt          j
        }| j        |         j        j        j        }||k    rt          d| d	| d|           t                              d           dS )a  Verify GPT-2 subgraph

    Args:
        graph (onnx.GraphProto): onnx graph of GPT-2
        precision (Precision): Precision (FLOAT16 or FLOAT32) of the model.

    Raises:
        ValueError: Number of inputs not expected.
        ValueError: Input name is not expected.
        ValueError: Input data type is not expected.
        ValueError: Number of outputs not expected.
        ValueError: Output name is not expected.
        ValueError: Output data type is not expected.
       r_   )	input_idsposition_idsattention_maskc                     g | ]}d | S )past_r5   r   is     r,   r   z(verify_gpt2_subgraph.<locals>.<listcomp>  s     HqHqHqYZQRHqHqHqr.    Number of inputs expected to be . Got Input  is expected to be $ is expected to have onnx data type z:Verifying GPT-2 graph inputs: name and data type are good.logitsc                     g | ]}d | S )present_r5   r   s     r,   r   z(verify_gpt2_subgraph.<locals>.<listcomp>  s    $P$P$P^^^$P$P$Pr.   !Number of outputs expected to be Output z;Verifying GPT-2 graph outputs: name and data type are good.N)r
   rw   r   r   range
ValueError	enumerater   r   INT32FLOATr<   tensor_type	elem_typer   r   r   )r   r   
is_float16input_countlayer_countexpected_inputsr   expected_inputexpected_type
input_typeexpected_outputsexpected_outputoutput_types                r,   verify_gpt2_subgraphr    s    i//Jek""K/K!EEEHqHq^cdo^p^pHqHqHqqO
5;3////jC<P<PjjX[\a\gXhXhjjkkk&77 
p 
p>;q>.00gaggNggRWR]^_R`Regghhh#)663=TK//;CTM[^(4>
&&nann]nnblnnooo '
KKLMMM z$P$PU;=O=O$P$P$PP
5<C 01111mSAQ=R=RmmZ]^c^jZkZkmmnnn'(899 q q?<??22jqjj_jjTYT`abTcThjjkkk/9P++{?Pl1o*6@-''oaoo]oobmooppp (
KKMNNN Fr.   c           
         |t           j        k    }|rt          j        nt          j        }t	          | j                  }|dz
  dz  }|dk    sJ ddg}t          |          D ]2}|                    d|            |                    d|            3t          |          D ]2}|                    d|            |                    d	|            3t	          | j                  t	          |          k    r4t          d
t	          |           dt	          | j                             t          |          D ]\  }}| j        |         j
        |k    r(t          d| d| d| j        |         j
                   |dk     rt          j        n|}	| j        |         j        j        j        }
|
|	k    rt          d| d|	 d|
           dg}t          |          D ]2}|                    d|            |                    d|            3t	          | j                  t	          |          k    r4t          dt	          |           dt	          | j                             t          |          D ]}\  }}| j        |         j
        |k    r(t          d| d| d| j        |         j
                   | j        |         j        j        j        }||k    rt          d| d| d|           ~dS )  Verify T5 decoder subgraph

    Args:
        graph (onnx.GraphProto): onnx graph of T5 decoder
        precision (Precision): Precision (FLOAT16 or FLOAT32) of the model.

    Raises:
        ValueError: Number of inputs not expected.
        ValueError: Input name is not expected.
        ValueError: Input data type is not expected.
        ValueError: Number of outputs not expected.
        ValueError: Output name is not expected.
        ValueError: Output data type is not expected.
    r   ra   r_   r   encoder_attention_maskpast_key_self_past_value_self_past_key_cross_past_value_cross_r   r   r   r   r   r   present_key_self_present_value_self_r  r  N)r
   rw   r   r	  r   r   r  r   r  r  r   r  r<   r
  r  r   )r   r   r  
float_typer  r  r  r   r  r  r  r  r  r  s                 r,   verify_t5_decoder_subgraphr!    s    i//J(2I$$8IJek""K?q(K! #$<=O; 7 73334445!556666; 8 844455561667777
5;3////jC<P<PjjX[\a\gXhXhjjkkk&77 p p>;q>.00gaggNggRWR]^_R`Regghhh-.UU))
[^(4>
&&nann]nnblnnooo ' !z; ; ; 7A 7 7888 9a 9 9::::
5<C 01111mSAQ=R=RmmZ]^c^jZkZkmmnnn'(899 o o?<??22jqjj_jjTYT`abTcThjjkkkl1o*6@*$$mqmmjmm`kmmnnn %	o or.   c           
         |t           j        k    }t          | j                  dz
  dz  }|dk    sJ g d}t          | j                  t          |          k    r4t          dt          |           dt          | j                             t          |          D ]\  }}| j        |         j        |k    r(t          d| d| d| j        |         j                   t          j	        }| j        |         j
        j        j        }||k    rt          d| d	| d|           d
dg}	t          |          D ]2}|	                    d|            |	                    d|            3t          |          D ]2}|	                    d|            |	                    d|            3t          | j                  t          |	          k    r4t          dt          |	           dt          | j                             t          |	          D ]\  }}
| j        |         j        |
k    r(t          d| d|
 d| j        |         j                   |rt          j        nt          j        }| j        |         j
        j        j        }||k    rt          d| d	| d|           t                               d           dS )r  r   ra   r_   )encoder_input_idsr  decoder_input_idsr   r   r   r   r   r   encoder_hidden_statesr  r  present_key_cross_present_value_cross_r  r  zMT5 encoder graph verified: name and data type of inputs and outputs are good.N)r
   rw   r   r   r   r  r  r   r   r  r<   r
  r  r  r   r	  r   r   )r   r   r  r  r  r   r  r  r  r  r  r  s               r,   'verify_t5_encoder_decoder_init_subgraphr(  -  s@    i//Ju|$$q(Q.K! [ZZO
5;3////jC<P<PjjX[\a\gXhXhjjkkk&77 p p>;q>.00gaggNggRWR]^_R`Regghhh#)[^(4>
&&nann]nnblnnooo ' !"9:; ; ; 7A 7 7888 9a 9 9::::; < < 8Q 8 8999 :q : :;;;;
5<C 01111mSAQ=R=RmmZ]^c^jZkZkmmnnn'(899 r r?<??22jqjj_jjTYT`abTcThjjkkk/9P++{?Pl1o*6@-''pqppmppcnppqqq ( KK_`````r.   shared_   graph1graph2shared_prefixmin_elementssignature_cache1signature_cache2c                     i }i }g }g }	g }
| j         D ]}|j        rt          |j                  |k    s"|j         D ]}|j        rt          |j                  |k    s"t          j        ||||          rp||j        z   ||j        <   |                    |           |j        |vr>||j        z   }|||j        <   |	                    |           |
                    |            nt                              d|
            | j	        D ]R}t          t          |j                            D ].}|j        |         |
v rt          d|j        |                    /S|j	        D ]R}t          t          |j                            D ].}|j        |         |
v rt          d|j        |                    /S|	D ]}|j                             |           |j        D ]}|j        |v r||j                 |_        |j	        D ]}t          t          |j                            D ]d}|j        |         |v rS||j        |                  }t                              d|j         d| d|j        |          d|            ||j        |<   e|D ]}| j                             |           | j        D ]}|j        |v r||j                 |_        | j	        D ]}t          t          |j                            D ]d}|j        |         |v rS||j        |                  }t                              d|j         d| d|j        |          d|            ||j        |<   e|	D ]}||j                 |_        |	D ]}t           j                            |          j        }t           j                            |j        |j        |          }| j                            |           |j                            |           |	S )	a  Remove initializers with same value from two graphs.

    Args:
        graph1 (GraphProto): the first graph to process
        graph2 (GraphProto): the second graph to process
        shared_prefix (str): add prefix to the shared initializers among two graphs
        min_elements (int, optional): minimal number of elements for initializers to be considered. Defaults to 1024.
        signature_cache1 (dict): Optional dictionary to store data signatures of tensors in graph1 in order to speed up comparison
        signature_cache2 (dict): Optional dictionary to store data signatures of tensors in graph2 in order to speed up comparison
    zshared initializers:zname is found in graph 1: zname is found in graph 2: zgraph 2 rename node z input z from z to zgraph 1 rename node )initializerr   sumr   has_same_valuer   r   r   r   noder  r   r   r   remove
value_infor   numpy_helperr   shapehelpermake_tensor_value_infor   )r+  r,  r-  r.  r/  r0  mapping_initializers_1mapping_initializers_2shared_initializers_1shared_initializers_2shared_initializers_namesinitializer1initializer2shared_namer5  jr2  r7  new_namer9  s                       r,   remove_shared_initializersrF  q  s   &   "*  ! 	c,*;&<&<&L&L". 	 	L % #l.?*@*@L*P*P'lDTVfgg 	<ILL]<]&|'89%,,\:::$,BBB"/,2C"CK@K*<+<=)00>>>-44[AAA	 LLC(ACCDDD  Q Qs4:'' 	Q 	QAz!} 999"#O
1#O#OPPP :	Q
  Q Qs4:'' 	Q 	QAz!} 999"#O
1#O#OPPP :	Q
 - / /!!+.... ' F F
?4444Z_EJO  ) )s4:'' 	) 	)Az!} 6661$*Q-@lDIllalltzZ[}llbjllmmm (
1		) - / /!!+.... ' F F
?4444Z_EJO  ) )s4:'' 	) 	)Az!} 6661$*Q-@lDIllalltzZ[}llbjllmmm (
1		) - D D1+2BC, - -!**;77=[778H+J_afgg
  ,,,  ,,,,  r.   encoder_modelr   c                 B   t          |           }t          |          }|                    d           |                    d           i i }}|                    |           |                    |           t          |j        j        |j        j        d||          }|S )Ne_d_s_)r-  r/  r0  )r   add_prefix_to_namesremove_duplicated_initializerrF  r   r   )rG  r   encoderdecoderr/  r0  initializerss          r,   get_shared_initializersrQ    s    &&G&&G%%%%%%)+R&))*:;;;))*:;;;-))  L r.   c                    g }| j         D ]7}|j        rt          |j                  |k    s"|                    |           8|D ]}| j                             |           |D ]k}t
          j                            |          j        }t
          j	        
                    |j        |j        |          }| j                            |           l|S )a^  Remove initializers of a graph, when they have number of elements larger than a threshold.

    Args:
        graph (GraphProto): the graph.
        min_elements (int, optional): minimal number of elements for initializers to be considered. Defaults to 1024.

    Returns:
        List[TensorProto]: initializers that are removed from the graph.
    )r2  r   r3  r   r6  r   r8  r   r9  r:  r;  r   r   r7  )r   r.  moved_initializerstensorr2  r9  r7  s          r,   move_initializersrU    s     # * * 	FK 0 0L @ @!!&))))) . .  ---- * , ,!**;77=[778H+J_afgg

++++r.   c                    | j         dk    rt          d| j         d          | j         dk    r| j        }n| j         dk    r| j        }n| j         dk    r| j        }n| j         dk    r| j        }n| j         dk    r| j        }n| j         d	k    r| j        }nl| j         d
k    r| j	        }nY| j         dk    r| j
        }nF| j         dk    r| j        }n3| j         dk    r| j        }n t          d| j         d| j          d          | j        |fS )z
    Convert attribute to kwarg format for use with onnx.helper.make_node.
        :parameter attribute: attribute in AttributeProto format.
        :return: attribute in {key: value} format.
    r   z
attribute z does not have type specified.r_   r   r   ra            r   	   
   z has unsupported type rD   )r<   r  r   fr   stgfloatsintsstringstensorsgraphs)	attributer*   s     r,   _attribute_to_pairrf    sB    ~TinTTTUUU ~	1			1			1			1			1		 	1			1		!	1		!	2		 ]in]]IN]]]^^^NE""r.   c                     i }| j         D ]+}t          |          \  }}|                    ||i           ,| j        r|                    d| j        i           |S )Ndomain)re  rf  updaterh  )r5  kwargsattrkeyr*   s        r,   	kwargs_ofrm  1  sj    F $ $)$//esEl####{ /x-...Mr.   c                 \    t          d | j        j        j        j        D                       S )Nc                 8    g | ]}|j         r|j         n|j        S r5   )	dim_param	dim_value)r   ds     r,   r   zshape_of.<locals>.<listcomp><  s'    gggA!+?!++AKgggr.   )tupler<   r
  r9  dim)vis    r,   shape_ofrv  ;  s*    ggI\IbIfggghhhr.   subgc                    d}d}g }t          | j                  D ]\  }}||k    rbt          |          }t          j                            |j        |j        j        j	        |d         |d         |d         d|d         g          }|
                    |g           |
                    t          j                            dt          j        j        dg	          g           |                     d
           | j        
                    |           g }t          | j                  D ]\  }}||k    rbt          |          }t          j                            |j        |j        j        j	        |d         |d         |d         d|d         g          }|
                    |g           |                     d           | j        
                    |           g }| j        D ]}	|	j        dk    rt#          |	          }
|
                    ddi           g }|
                    |	j                   t'          |          dk     r)|
                    dg           t'          |          dk     )t'          |          dk     r|
                    dg           t          j        j        d||	j        fd|	j        i|
}	|
                    |	g           |                     d           | j        
                    |           | S )Nr   r_   r   r   max_seq_lenra   r  r9  past_sequence_lengthr9  r   r   	AttentionrY   rX  r"   rY  r   r5  )r  r   rv  r   r:  r;  r   r<   r
  r  r   r   r  
ClearFieldr   r5  r   rm  ri  r   	make_node)rw  input_past_0output_past_0
new_inputsr   ru  r9  new_outputs	new_nodesr5  rj  niss               r,   1update_decoder_subgraph_past_present_share_bufferr  ?  s   LMJ4:&&    2RLLE33'-7Qxq58]E!HM 4  B
 	2$t{99:PRVRbRhqrps9ttuvvvOOGJj!!!K4;'' ! !2RLLE33'-7Qxq58]E!HM 4  B
 	B4    OOHK{###I	 ! !<;&&t__FMM6:;;;CJJtz"""c((Q,,

B4    c((Q,,3xx!||

23444;(c4;aaTYaZ`aaD$    OOFIYKr.   is_beam_searchswitch_attentionc                 
   |rg }t          | j                  D ]\  }}|                    |g           |                    t          j                            dt          j        j        dg          g           |                    t          j                            dt          j        j        g d          g           |                     d           | j                            |           |rg d}g }| j	        D ]Q}|j
        dk    r,t          |          }	|	                                D ]7}
|
d	k    r  d
S |
|vr'|
dk    rt                              d|
 d           |	|
= 8g }|                    |j                   |rt          |          dk     r)|                    dg           t          |          dk     )t          |          dk     r|                    dg           t          |          dk     r|                    dg           t          j        j        d||j        fd|j        i|	}|                    |g           S|                     d           | j	                            |           dS )aS  Update the Attention nodes to DecoderMaskedSelfAttention.

    Args:
        subg (GraphProto): GraphProto of the decoder subgraph
        is_beam_search (bool): Boolean specifying if the sampling algo is BeamSearch
        switch_attention (bool): Boolean specifying if `Attention` is to be switched with `DecoderMaskedSelfAttention`
    
beam_widthr_   r|  cache_indirection
batch_sizer  ry  r   rY   	num_headsscalemask_filter_valuerh  r}  qkv_hidden_sizesFunidirectionalzRemoving attribute: zB from Attention node while switching to DecoderMaskedSelfAttentionrY  r"   r   rZ  DecoderMaskedSelfAttentionr   r5  T)r  r   r   r   r:  r;  r   r  r~  r5  r   rm  copyr   r   r   r  r   r   )rw  r  r  r  _iru  'decoder_masked_attention_supported_attrr  r5  rj  kr  s               r,   4update_decoder_subgraph_use_decoder_masked_attentionr  o  s     &

++ 	$ 	$FBrd#### 	4;==lDL\Lbkljm=nnoppp22')9)?GrGrGr 3  	
 	
 	
 	   
*%%% 0$3
 3
 3
/ 	I $	% $	%D|{**"4 & &A ...$uuu GGG  000"NN |q | | |   #1I

4:&&& " :c((Q,,

B4((( c((Q,,3xx!||

L>2223xx!||

$7#8999{,0#t{ IMV\  dV$$$$	###4r.   c                 <   t                      }g }d t          | j                  D             }i }i }| j        D ]E}|j        D ]*}|r&||vr|g||<   ||                             |           +|j        D ]	}|r|||<   
F| j        D ]}|j        dk    r|j        d         r|j        d         s*|j        d         |j        d         }
}	d}| j        D ]}|j        |
k    r|} n|bt          j
                            |          }|j        dk    r|                                dk    r|j        d         |v r||	         }|j        dk    r|j        d         r|j        d         |v r|j        d                             d          s |j        d                             d	          rn|                    |j        d                    |                    |           t!          ||j        d                            dk    r|                    |           ||fS )
az  Correct graph which originally use dim of past_seq_len from input_ids's shape which is fixed to max_seq_len after
       shared past/present buffer

    Args:
        subg (GraphProto): GraphProto of the decoder subgraph
    return:
        tensor_names_to_rename : set of tensor names which is equal to past_sequence_length
        nodes_to_remove : list of node to remove
    c                 $    i | ]\  }}|j         |S r5   r   )r   indexinps      r,   
<dictcomp>z+find_past_seq_len_usage.<locals>.<dictcomp>  s     QQQZUC5QQQr.   Gatherr_   r   Nr   Shaper  r  )setr  r   r5  r   r   r   r2  r   r   r8  r   sizeitem
startswithaddr   )rw  tensor_names_to_renamenodes_to_removegraph_input_namesinput_name_to_nodesr   r5  
input_nameoutput_nameshape_tensor_nameshape_index_nameini_gather_indicesrT  gather_indices_arr
shape_nodes                  r,   find_past_seq_len_usager    s    !UUOQQ9TZ;P;PQQQ	 	8 	8* 	A 	AJ A%8887;f'
33'
3::4@@@; 	8 	8K 837#K0	8 	 ; ;<8##:a= 
1 37:a=$*Q-/!%*  ;"222)/&E 3 ")!%!2!;!;<N!O!O!&!++0B0G0G0I0IQ0N0NSWS]^_S`dwSwSw01BC
&'11"(+ 2"(+/@@@"(+667GHH A &+A.99:LMM A +..t{1~>>>#**4000.z/@/CDEEJJ'..z:::!?22r.   r_   re   r   	attn_maskkv_num_heads
world_sizewindow_sizec                    |                      t          j                            dt          j        dgdg                     t          j                            d|dg|dz   g|                     d                    }t          j                            d|dz   dgdg|                     d                    }t          j                            d	dgd
g|                     d	          t          j                  }t          j                            d|g|dz   g|                     d                    }t          j                            d|dz   dgdg|                     d          d          }	t          j                            d	dgdg|                     d	          t          j                  }
| j	        j
        j                            |||||	|
g           t          t          d | j	        j
        j                            }t          |          D ]\  }}|                     |g dg d          }|                     |ddgddg          }d\  }}}||\  }}}n||\  }}|                     |g dg d          }|                     |ddgddg          }d\  }}}||\  }}}n||\  }}|                     |ddgddg          }|                     |dgdg          }d\  }}||\  }}n
||d         }d}|||j        D ]}|j        dk    r|j        }d}|j        D ]}|j        dk    r|j        }|j        d         |j        d         k    o|j        d         |j        d         k    }|d uo|d uo|d u} |d u o|d u o|d u }!d\  }"}#}$|r| s|!rt+          j        |                     |j        d                             }%t+          j        |                     |j        d                             }&t+          j        |                     |j        d                             }'|%j        d          }(t3          j        |%|&|'fd!                              |(d"|(z            })t          j                            |)d#| $          })|                      |)           t          j                            d|j        d         |)j        g|)j         d%g|                     d                    }*| j	        j
        j                            |*g           | j	        j
        j                            |           | j	        j
        j                            |           | j	        j
        j                            |           |*j        d         }"| rt+          j        |                     |j        d                             }+t+          j        |                     |j        d                             },t+          j        |                     |j        d                             }-|+j        d          }(t3          j        |+|,|-fd!                              d"|(z            }.t          j                            |.d&| $          }.|                      |.           t          j                            d|*j        d         |.j        g|.j         d%g'          }/| j	        j
        j                            |/g           | j	        j
        j                            |           | j	        j
        j                            |           | j	        j
        j                            |           |/j        d         }"n'|j        d         }"|j        d         }#|j        d         }$t          j                            d(|"|#|$|j        d)         |j        d*         |j        d         |
j        d         ||j        d         nd+||j        d"         nd+g	|j        |j                             d,d(          d-||z  |dk    r||z  n||z  |tC          |d uo|d u          |.
  
        }0| j	        j
        j                            |           | j	        j
        j                            |0g           |$| j	        j
        j                            |           |$| j	        j
        j                            |           | S )/Noner_   r   r   r   vals	ReduceSum	_row_sumsinputsoutputsr   Subseqlens_k_int64Cast	seqlens_k)r  r  r   tor  _shaper  total_seq_len_int64r   )r  r  r   r   total_seq_lenc                     | j         dk    S )NMultiHeadAttention)r   )r5  s    r,   <lambda>z&replace_mha_with_gqa.<locals>.<lambda>V  s    9M)M r.   )RotaryEmbeddingAddr   )r   r   r   r  r   )NNN)r_   r   r   r  r   NNinterleavedr  )r"   r"   r"   re   r   r   QKV_Weight_r  _output	QKV_Bias_)r  r  GroupQueryAttentionrX  rY  r"   r  com.microsoft)	r  r  r   rh  r  r  local_window_size	do_rotaryrotary_interleaved)"add_initializerr   r:  make_tensorr   INT64r  create_node_namer  r   r   r5  r   listfilterr  match_parent_pathre  r   r   r   r   r   r   r9  r   stackreshaper8  
from_arrayr6  r   replacerx   )1r   r  r  r  r  reduce_sum_nodesub_nodeseqlen_k_cast_noder  gather_nodetotal_seqlen_cast_node	mha_nodesidxr5  q_path_1q_path_2q_rotaryq_addq_matmulk_path_1k_path_2k_rotaryk_addk_matmulv_path_1v_path_2v_addv_matmulr  attr  root_input_is_sameall_paths_have_biasall_paths_have_no_biasq_input_to_attentionk_input_to_attentionv_input_to_attentionqwkwvwrt  
qkv_weightpacked_matmul_nodeqbkbvbqkv_biaspacked_add_nodegqa_nodes1                                                    r,   replace_mha_with_gqar	    s	    
!'	 	  	
 	
   k++5![()##K00	 ,  O {$$K'/"###E**	 %  H ..!"##F++ /   &&{X%&##G,,	 '  J +''H$e,&'##H-- (  K "[22%& !##F++ 3   
K!!	($6
KQgh  : VMMu{O`OeffggIy)) B4 B4	T**41U1U1UW`W`W`aa**42CX1NQRTUPVWW$4!%(0%HeXX!!)Hh **41U1U1UW`W`W`aa**42CX1NQRTUPVWW$4!%(0%HeXX!!)Hh **4%1BQFKK**4(aSAA$x&OE88!{H H$8) ( (8},,"%%K 	> 	" 	"Cx;&&E	 &^A.(.2CCnWXHY]e]klm]nHn $4/[E4E[%W[J[!&$!R5D=!RUd] LVH24H .	6#6 .	6:P .	6%e&;&;HN1<M&N&NOOB%e&;&;HN1<M&N&NOOB%e&;&;HN1<M&N&NOOB(2,C2r2,Q777??QWMMJ*55jGZUXGZGZ5[[J!!*---!%!6!6 q):?;&O4445++H55	 "7 " " K"))+=*>???K"))(333K"))(333K"))(333#5#<Q#?  # A )%*?*?A*O*OPP )%*?*?A*O*OPP )%*?*?A*O*OPPhrl8RRLq999AA!c'JJ,77GXSVGXGX7YY%%h///"&+"7"7.5a8(-H (6667 #8 # #
 !&--.?@@@!&--e444!&--e444!&--e444'6'=a'@$ $,?1#5 #+?1#5 #+?1#5  ;((!$$$
1
1")!,&-a0%-%9q!!r%-%9q!!r
 K""#79NOO":-4@A4E4Ej00<[eKe)($.G843GHH*) ) 
 
, 	%%d+++%%xj111K"))(333K"))(333Lr.   c           	          d}d  j         D             }|dk     rA||                             d          s&|dz  }|dk     r||                             d          &d}t           j                  |z
  dz  }d|z  |z    fdt	          |          D             }t          d|            t           j                            }t          d|            |d	         }|d         }|d         }	d	}
 j        D ](}|j        d
k    r|j         d         |v r	t          d|j	         d|j                    |
dz  }
||j         d                  }d| }dgdt          |j                  z
  z  }|
                    |           |j                            |           |j                            t          j                            dd          g           t          j                            |t"          j        ||d|	g          } j                            |g           *|
|k    rt'          d| d|
           d S )Nr_   c                     g | ]	}|j         
S r5   r  r   gis     r,   r   zBupdate_decoder_subgraph_output_cross_attention.<locals>.<listcomp>      666R666r.   r   pastr   c                 B    i | ]}j         |d z  z            j        |S )r   )r   r   )r   layerinput_cross_past_0rw  s     r,   r  zBupdate_decoder_subgraph_output_cross_attention.<locals>.<dictcomp>  s0    sssX]TZ	4F(FGLesssr.   z    --past_key_cross_inputs=zpast_key_cross_0_shape is r   DecoderMaskedMultiHeadAttentionz'    -- add cross QK output from: node: z with output: output_cross_qk_r"   	output_qkz#Did not add cross QK for all layersz vs )r   r  r   r   r  printrv  r5  r   r   r   r   re  r   r:  make_attributer;  r   r	  r  )rw  input_self_past_0r  output_self_present_0
num_layerspast_key_cross_inputsinput_past_key_cross_0_shapebatch_size_dimnum_heads_dimcross_seq_len_dimnum_layer_output_qkr5  r  cross_attention_out_nameappended_namescross_attentionr  s   `               @r,   .update_decoder_subgraph_output_cross_attentionr$    s   664:666
a

(9:K(L(W(WX^(_(_
Q a

(9:K(L(W(WX^(_(_
dk""%::q@JZ*;;sssssafgqararsss	
@)>
@
@AAA#+DJ7I,J#K#K 	
E'C
E
EFFF1!4N03M4Q7	 2 2L===DJqMUjDjDjbDIbbUYU`bbccc1$)$*Q-8E'A%'A'A$ TQT[)9)9%9:N!!":;;;K~...N!!4;#=#=k1#M#M"NOOO"k@@(+*;nm]^`q=r O K0111j((dzddObddeee )(r.   c           
      x   d}d | j         D             }|dk     rA||                             d          s&|dz  }|dk     r||                             d          &d}t          t          | j                   |z
  dz            }d|z  |z   }g }g }| j        D ]#}|j        dk    r|                    |g           $t          |          |k     rdS d }	| j        D ]}|j        d	k    r|}	 ng d
}
d}t          |           \  }}t          |          dk    r|D ]}t          d| d|            |D ]!}t          d|j         d|j	                    "t          j                            ddgdgd          }t          j                            ddg|gdt          j                  }|                    ||g           | j        D ]}t          |j                  dk    ry|	w|j        d         |	j         d         k    r[t          j                            ddgdgdt          j                  }|j        d         |j         d<   |                    |g           |j        dk    rt!          |          }|                                D ]	}||
vr||= 
|j         d         |j         d         |j         d         g}|                    t          |j                   dk    r|j         d         ndg           |                    t          |j                   dk    r|j         d         ndg           |                    t          |j                   dk    r|j         d         ndg           |                    t          |j                   dk    r|j         d         ndg           |                    dg           |                    dg           |                    d g           |                    t          |j                   dk    r|j         d         ndg           d|d!<   t          j        j        d"||j        fd#|j	        i|}||vr>t%          |j                   D ]\  }}||v r
||j         |<   |                    |g           |                     d$           | j                            |           d% | j         D             }g }t%          | j                   D ]\  }}||k    ra||k     r[t)          |          }t          j                            |j	        |j        j        j        |d         |d         d&|d         g'          }|                    |g           d|vrF|                    t          j                            dt          j        j        dg(          g           d|vrF|                    t          j                            dt          j        j        dg(          g           d |vrG|                    t          j                            d t          j        j        g d)(          g           |                     d*           | j                             |           g }t%          | j                  D ]|\  }}||k    r[t)          |          }t          j                            |j	        |j        j        j        |d         |d         d&|d         g'          }|                    |g           }|                     d+           | j                            |           d,S )-Nr_   c                     g | ]	}|j         
S r5   r  r  s     r,   r   zSupdate_decoder_subgraph_share_buffer_and_use_decoder_masked_mha.<locals>.<listcomp>  r  r.   r   r  ra   r   r  FRelativePositionBiasr  #past_sequence_length_squeezed_int64r   zFound tensor name z to be renamed to zFound node to removed: type:z, name:Squeezer{  past_sequence_length_squeezed!node_past_sequence_length_squeezer  r  &node_past_sequence_length_squeeze_cast)r   r  past_sequence_length_int64past_sequence_length_castr"   rW  rX  rY  r  r  rY   r  r   r5  c                     g | ]	}|j         
S r5   r  )r   r  s     r,   r   zSupdate_decoder_subgraph_share_buffer_and_use_decoder_masked_mha.<locals>.<listcomp>q  s    777S777r.   ry  rz  r|  r  r   r   T)r   r  rx   r   r5  r   r   r  r  r   r   r:  r  r   r  r   rm  r  r  r~  rv  r;  r<   r
  r  r  )rw  r  r  output_self_past_0r  r  r  	old_nodesr5  rel_pos_bias_noder  target_squeezed_past_seq_namer  r  name_to_renamenrsqueeze_node	cast_noderj  r  r  r  r   orig_input_namesr  r   ru  r9  r  s                                r,   ?update_decoder_subgraph_share_buffer_and_use_decoder_masked_mhar9    s   664:666
a

(9:K(L(W(WX^(_(_
Q a

(9:K(L(W(WX^(_(_
c$*oo(99Q>??JZ*;;II	 % %<///dV$$$ 9~~
""u 	  <111 $E 2/ / /+ %J!.Ed.K.K+O
!""Q&&4 	j 	jNh~hhIfhhiiii! 	O 	OBMMMBGMMNNNN{,,#$,-4	 - 
 
 K)),-*+9  * 
 
	 	,	2333	 ,% ,%t{a$5$AdkRSnXiXopqXrFrFr--'(-.0$ .  I &,Q/DJqMi[)))<///t__F[[]] " "CCCq	 
1
1
1C JJTZ1)<)<
1"EFFFJJTZ1)<)<
1"EFFFJJTZ1)<)<
1"EFFFJJTZ1)<)<
1"EFFFJJ./000JJ~&&&JJ+,---JJTZ1)<)<
1"EFFF23F./;(13 JN)W] D &&(44 F Ft111(EDJu%dV$$$OOFIY77DJ777J4:&&    2!!!a*<&<&<RLLE33'-7Qxq=%(C 4  B
 	2$%555[//0FHXH^ghfi/jjk	
 	
 	
 +++4;==lDL\Lbkljm=nnoppp"22222')9)?GrGrGr 3  	
 	
 	
 	OOGJj!!!K4;'' ! !2"""RLLE33'-7Qxq=%(C 4  B
 	B4    OOHK{###4r.   model_protoc                    t          |           }|                                }g }g }|                                D ]m}|j        dk    r^d|j        d         v rd|j        d         v r.||j        d                  }||j        d                  }||j        d                  }|                    |j        d                   }	|                    |j        d                   }
|                    |j        d                   }|	r|
r|s dS t          j        |	          }t          j        |
          }t          j        |          }t          j	        |||gd          }|
                    d	d
          }t          j                            |dz   |	j        dk    rt          j        nt          j        |j        d         |j        d         g|                                                                          }| j        j                            |g           t          j                            d	|j        d         |dz   g|dz   g|          }|j        d         |j        d<   d|j        d<   d|j        d<   |                    |g           |                    |||g           o|                    |           |                    |           |                                 |                                 dS )Nr  past_key_crossr_   past_value_crossr   r   Fr   r   
MatMul_QKV)name_prefix_weightr  _outr  r"   T)r   r   nodesr   r   r   r   r   r   r   r  r   r:  r  r   r   r	  rw   r9  flattentolistr   r2  r   r  r   	add_nodesremove_nodesupdate_graphtopological_sort)r:  
onnx_modelr   nodes_to_addr  r5  r  r  r  q_weightk_weightv_weightr  r  r   r  matmul_node_nameweightr   s                      r,   pack_qkv_for_decoder_masked_mharP    s   ;''J$88::LO  "" *C *C<<<<4:a=005G4:VW=5X5X*4:a=9H*4:a=9H*4:a=9H!11(.2CDDH!11(.2CDDH!11(.2CDDH  h uu%h//B%h//B%h//BR1===J)::8Q]:^^[,,%	1/7/AQ/F/F+++KL_ &q):+;A+>?''))0022	 -  F )00&:::+// q)+;i+GH)F23%	 0  K (.q1DJqMDJqMDJqM...""Hh#ABBB&&&O,,,!!!4r.   decoder_onnx_pathc                    t          j        | d          }t          t          |j        j                            D ]}|j        j        |         j        dk    s|j        j        |         j        dk    r\|j        j        |         j        j        j	        j
        d         }|                    d          r|                                 d|_        t          j        || |           dS )aQ  Update the input shapes for the inputs "input_ids" and "position_ids" and make the sequence length dim value 1 for each of them.
       The decoder model will be over-written.

    Args:
        decoder_onnx_path (str): Path of GPT-2 decoder onnx model
        use_external_data_format(bool): output tensors to external data or not.
    Tr   r   r   r_   rp  r   )r   r   r  r   r   r   r   r<   r
  r9  rt  HasFieldClearrq  r   r   )rQ  rP   r   r   shape_dim_protos        r,   *update_input_shapes_for_gpt2_decoder_modelrV    s     /*;PTUUU3*067788 * *%+A.3{BB"(.q16.HH17=a@EQW[\]^O ''44 (%%''' )*O%N&(9Qijjjj4r.   init_decoder_onnx_pathc           	      P	   t          j        | d          }|j        j        d         j        }t          |          }|                                }||v sJ ||         }|j        dk    rdS |                    |g dg d          }||                    |g d	g d
          }|8|                    |g dg d          }||                    |g dg d          }|dS |d         }	|	j        dk    }
|
sd}|                    |	g d|dddg          }|d}|                    |	g d|dddg          }|d}|                    |	g d|ddg          }|d}|                    |	g d|ddg          }n|d}|                    |	g d|ddg          }|d}|                    |	g d|ddg          }|d}|                    |	ddg|dg          }|d}|                    |	ddg|dg          }|dS |dk    rdnd}|
s|	                    |	d|          }n|	                    |	d|          }|dS |d         }|d         }t           j
                            dt          j        dgdg          }t           j
                            dt          j        dgdg          }t           j
                            dt          j        dgdg          }t           j
                            dt          j        dgdg          }|                    |           |                    |           |                    |           |                    |           d|j        d         z   }t           j
                            d|j        d         ddddg|g|                    dd                     }|
s|j        d         n|j        d!         }d|j        d         z   }t           j
                            d|ddddg|g|                    dd"                     }|                    |           |                    |           |                    ||j        d         |           |                    |	||           |                                 t          j        |||#           dS )$a  Generates the initial decoder GPT2 subgraph and saves it for downstream use.
       The initial decoder model will be saved to init_decoder_onnx_path.

    Args:
        decoder_onnx_path (str): Path of GPT-2 decoder onnx model
        init_decoder_onnx_path (str): Path of GPT-2 init decoder onnx model
        use_external_data_format(bool): output tensors to external data or not.
    Tr   r   r   F)r  LayerNormalizationr  r  r  r   r  FastGelur  r   r  rY  r  )r   r   r   r_   r   r   r   r   r   r   r   r   r   N)
r  SkipLayerNormalizationr  r   r  rZ  r  r   r  r[  )
r   r   r_   r   r   r   r   r   r   r   )rY  r  r  r   rZ  r   rY  r  )r   r   r_   r   r   r   r   r   )r[  r   rZ  r   r[  )r   r_   r   r   r   re   r[  )r  r  r   r}  r_   )r  r   r}  )r  r   r}  r}  r  SliceLastTokenStartsr  SliceLastTokenEndsSliceLastTokenAxesSliceLastTokenStepsedge_modified_SliceGatherLastToken_0_r  r   GatherLastToken_1_r   )r   r   r   r   r   r   r   r   r  r   r:  r  r   r  r  r  r  add_nodereplace_node_inputrH  r   )rQ  rW  rP   init_decoder_model_protor   gpt2_init_decoder_modelr   logits_matmul_node"logits_matmul_to_residual_add_pathresidual_add_nodeis_skiplayernorm_path&residual_add_to_attention_parent_indexresidual_add_to_attention_path residual_add_to_add_parent_indexadd_before_residual_add	attentionmatmul_after_attentionslice_starts
slice_ends
slice_axesslice_stepsslice_0_output_nameslice_node_0add_before_residual_add_outputslice_1_output_nameslice_node_1s                             r,   generate_gpt2_init_decoderr|    s     $/@UYZZZ17>qAF'(@AA1EEGG!44444,-?@ !X--u *A)R)R	
 	
 	
 	0//#* *&* *1-D-V-V   +**.
 .
*$ *1-D-V-Vmmm$$$.
 .
* .51H1Z1Z"    
2 
2. *1u:2> .59QQ ! 712.)@)R)REEEHnpqstvwGx*
 *
&
 *1562-D-V-V!6667AqA. .* *1562-D-V-V!#A#A#ADjlmopCq. .*
 *1562-D-V-V!#A#A#ADjlmopCq. .* 23.)@)R)R>>>Agijlm@n*
 *
&
 *1562-D-V-V!#B#B#BEkmnpqDr. .*
 *1562-D-V-V!Hk#:=cef<g. .*
 *1562-D-V-V!Hk#:=cef<g. .*
 &-u,RVW,W,Wqq]^$ ! 	
"9"F"Fu&F#
 #
 #:"F"F79Y#
 #
 &u.r2I;B?;**##ST	 +  L ((!#ST	 )  J ((!#SS	 )  J +))"#ST	 *  K ++L999++J777++J777++K888 +Y-=a-@@;((Q"  !
 %%$55g?STT )  L" 2Gm&q))LcLjklLm # +-D-KA-NN;((*"  !
 %%$55g?STT )  L $$\222$$\222 ../EyGWXYGZ\oppp../@B`buvvv ,,... N+-C[stttt4r.   c                    t          d          }t          |j                  }t          |j                  }t          |j                  }| j        j        D ]l}|j        j        j        j	        D ]S}|
                    d          r<|j        ||||fv r/t          |j                  }|                                 ||_        Tm| j        j        D ]l}|j        j        j        j	        D ]S}|
                    d          r<|j        ||||fv r/t          |j                  }|                                 ||_        TmdS )zoMake dim_proto numeric.

    Args:
        model: T5 encoder and decoder model.
        config: T5 config.
    r_   rp  N)rq   r  d_modeld_kvr   r   r<   r
  r9  rt  rS  rp  rx   rT  rq  r   )	r   configsequence_lengthr  hidden_size	head_sizerT  	dim_protorq  s	            r,   make_dim_proto_numeric_t5r    ss    !ffOF$%%Ifn%%KFK  I+$ 
0 
006: 		0 		0I!!+.. 093F	K 4 4  	 344	!!!&/	#		0 +# 
0 
006: 		0 		0I!!+.. 093F	K 4 4  	 344	!!!&/	#		0
0 
0r.   generation_typec                 (   | j         dk    }|t          j        k    }|t          j        k    }|t          j        k    }| j        }t                              d|            t          | j	                  dk    ru| j	        d         dk    rd|r[| j
        t          j        k    rFg d| _	        t                              d| j	                    t                              d           ng | _	        |s|r=|st          d	          | j        rt          d
          | j        rt          d          |r|r| j        st#          d          | j        r|st#          d          | j        r| j        st#          d          |r| j        rHt(          j                            | j                  r$t                              d| j                    n| j        spd                    | j        | j
        t          j        k    rdnd          }t3          t3          | j                  j        |                                          | _        t                              d| j         d| j         d           t;          |            nk| j        r2| j        r+t                              d| j         d| j                    n2t                              d| j         d           t?          |            d}| j         sv| j
        t          j        k    ra|r_|s|s|rYt                              d| j         d           tC          | j        | j"                  }|st          #                    d           d}	d}
| j$        s|r|s|s|rt                              d| j         d            d!                    | j
        t          j        k    rdnd          }t3          t3          | j                  j        |                                          }
tK          | j        |
| j"                  }	|	st          #                    d"           |	r)tM          | j        | j"                  st#          d#          |s	| j'        s|	rrt                              d$| j         d           tQ          | j        | j"                   |	r3t                              d$|
 d           tQ          |
| j"                   |r!tS          j*        | j        | j+        %          }nL| j         d&k    r!tY          j*        | j        | j+        %          }n t[          j*        | j        | j+        %          }| j.        rt                              d'|            |j/        }|r|j/        n|j0        }|j1        }| j1        d(k    r| j1        }| j/        d(k    r| j/        }| j0        d(k    r| j0        }te          j3        | j        d)*          }| j          d+|j4        _5        d}| j         dk    ratm          |j4        | j
                   |	rDte          j3        |
d)*          }| j          d,|j4        _5        tm          |j4        | j
                   nto          |j4        | j
                   d}|rg d-}n|s|rg d.}| j8        r|9                    d/           n|9                    d0           | j:        r|9                    d1           n|9                    d0           | j;        r|9                    d2           n|9                    d0           |rU| j<        r| j=        r|9                    d3           n|9                    d0           | j>        r|9                    d4           d5g}| j        r|9                    d6           | j        r&| j        s
J d7            |9                    d8           d}|r,td          j?        @                    d9||d:| j          ;          }n[|r,td          j?        @                    d<||d=| j          ;          }n-|r+td          j?        @                    d>||d?| j          ;          }d@|_A        d}|rtd          j?        B                    dA|          td          j?        B                    dB|          td          j?        B                    dC| jC                  td          j?        B                    dD| jD        rdnd          td          j?        B                    dE| j         dk    rdnd          g}n|rtd          j?        B                    dA|          td          j?        B                    dB|          td          j?        B                    dE| j         dk    rdnd          td          j?        B                    dC| jC                  g}nk|rhtd          j?        B                    dA|          td          j?        B                    dB|          td          j?        B                    dE| j         dk    rdnd          td          j?        B                    dC| jC                  td          j?        B                    dF| jE                  td          j?        B                    dG| jF                  td          j?        B                    dH| jG                  td          j?        B                    dI| jH                  td          j?        B                    dJ| j<                  td          j?        B                    dK| jI                  g
}|r4|J                    td          j?        B                    dL|          g           |jK        J                    |           g }| j         dMv ra| j'        r=t                              dN| j         d           tQ          | j        | j"                   te          j3        | j        d)*          }| j          dO|j4        _5        t          |j4        | j
                   t          ||           t          ||           |r| j        st#          dP          t                              dQ           t          |j4                  rt                              dR           nt                              dS           t          |          rt                              dT           nt                              dU           | jP        sGt          ||          }t                              t          |           dVdW |D              dX           |jK        J                    td          j?        B                    dY|j4                  td          j?        B                    dZ|j4                  td          j?        B                    d[t          |j4        jR                  d\k    r|jS        nd(          g           n|	r| jP        sGt          ||          }t                              t          |           dVd] |D              d^           |r.t                              d_           t          |j4                   | j        r%t          |j4        |d          st#          d`          |jK        9                    td          j?        B                    da|j4                             n>t          |j4                  }t                              t          |           db           |r.t                              dc           t          |j4                   | j        r%t          |j4        |d)          st#          dd          |jK        9                    td          j?        B                    dZ|j4                             td          j?        W                    det          jY        dfdgg          }td          j?        W                    dht          jY        dg          }td          j?        W                    dit          jY        dg          }td          j?        W                    djt          jY        dg          }td          j?        W                    dkt          jY        dg          }td          j?        W                    dlt          jZ        dg          }td          j?        W                    dmt          jZ        dg          }d}|r
|||||||g}n
|s|r||||g}| j8        rAtd          j?        W                    d/t          jY        |g          } |9                    |            | j:        rBtd          j?        W                    d1t          jY        df|g          }!|9                    |!           | j;        rBtd          j?        W                    d2t          jY        dfdgg          }"|9                    |"           | j<        rI| j=        rBtd          j?        W                    d3t          jY        df|g          }#|9                    |#           |rH| j>        rAtd          j?        W                    d4t          jY        dg          }$|9                    |$           d}%|r.td          j?        W                    d5t          jY        g dn          }%n1|s|r-td          j?        W                    d5t          jY        dfdhg          }%|%g}&| j        rBtd          j?        W                    d6t          jZ        dfdkg          }'|&9                    |'           | j        rDtd          j?        W                    d8t          jZ        dodfdj|g          }(|&9                    |(           td          j?        [                    |g|s
| j          dpn	| j          dq||&|          })td          j?        \                    |)dr|j]        s          }*| j"        rtddtl^m_}+ |+`                    td          ja                  |+`                    du          k     rt          #                    dv           t          jc        |*| j        d)d)w           nte          jc        |*| j                   t                              dx| j                    dS )yzConvert model according to command line arguments.

    Args:
        args (argparse.Namespace): arguments parsed from command line
    r>   z**** past_present_share_buffer=r_   r   rM   )r  rY  r[  rZ  z**** Setting op_block_list to zI**** use --op_block_list if you want to override the block operator list.z<Currently only gpt2 with greedy search/sampling is supportedzLoutput_sequences_scores currently is not supported in greedy search/samplingzHoutput_token_scores currently is not supported in greedy search/samplingzi`use_decoder_masked_attention` MUST be turned on to use `past_present_share_buffer` in case of BeamSearchzS`past_present_share_buffer` MUST be turned on to use `use_decoder_masked_attention`z?`use_decoder_masked_attention` option is only supported on GPUsz)skip convert_to_onnx since path existed: z{}_past_{}.onnxr   r   zConvert GPT model z	 to onnx z ...z,skip convert_to_onnx since paths specified: z and zConvert model z to onnx ...Fz=Pad logits MatMul weights for optimal MatMul perf in fp16 on z. The file will be overwritten.z]Tried and failed to pad logits MatMul weights. Performance may be sub-optimal for this MatMulNz*Creating an initial run GPT2 decoder from z. zgpt2_init_past_{}.onnxzuTried and failed to generate the init decoder GPT2 model. Performance may be sub-optimal for the initial decoding runzGCould not update the input shapes for the non-initial decoder subgraph.z Run symbolic shape inference on r   r?   zConfig=re   Tr   z decoderz init decoderr   
max_length
min_length	num_beamsnum_return_sequenceslength_penaltyrepetition_penaltyr   r  r  r  rX   r"   r[   r   r]   r^   	sequencessequences_scoresz8--output_token_scores requires --output_sequences_scoresscores
BeamSearchBeamSearch_r  GreedySearchGreedySearch_Sampling	Sampling_r  eos_token_idpad_token_idno_repeat_ngram_sizerW   r   temperaturetop_pfilter_valuemin_tokens_to_keepcustompresence_penalty
vocab_sizer?   r@   zSymbolic shape inference on z encoder and decoder initzMpast_present_share_buffer is only supported with use_decoder_masked_attentionzl*****update t5 decoder subgraph to share past/present buffer and use decoder_masked_multihead_attention*****z4*****update t5 decoder subgraph successfully!!!*****zF*****DecoderMaskedMultiHeadAttention is not applied to T5 decoder*****z9*****pack qkv for decoder masked mha successfully!!!*****z3*****pack qkv for decoder masked mha failed!!!*****z shared initializers (c                     g | ]	}|j         
S r5   r  r   s     r,   r   z,convert_generation_model.<locals>.<listcomp>c	  s    <Z<Z<ZQV<Z<Z<Zr.   z>) in encoder and decoder subgraphs are moved to the main graphrN  rO  decoder_start_token_idr   c                     g | ]	}|j         
S r5   r  r   s     r,   r   z,convert_generation_model.<locals>.<listcomp>	  s    @^@^@^A@^@^@^r.   zC) in decoder and init decoder subgraphs are moved to the main graphzY*****update init decoder subgraph to make past and present share buffer******************zLCould not update the init decoder subgraph to use DecoderMaskedSelfAttentioninit_decoderz: initializers from the decoder are moved to the main graphzT*****update decoder subgraph to make past and present share buffer******************zGCould not update the decoder subgraph to use DecoderMaskedSelfAttentionr   r  r  r  r  r  r  r  r  )r  r  r  zmax_length - sequence_lengthz beam searchz greedy searchzonnxruntime.transformers)producer_nameopset_imports)versionz1.12.0z0Require onnx >= 1.12 to save large (>2GB) model!)r   all_tensors_to_one_filezmodel save to )dr   r$   r2   r3   r4   rY   r   r   r   r   r   r
   rw   NotImplementedErrorrU   rV   rZ   r  rh   r   rs   rt   existsformatr   r   r   r   as_posixr   r   r   rR   r   rP   r   rS   r|  rV  rQ   r   r   from_pretrainedr   r   r   rH   r  r  r  r   r   r   r   r  r!  rX   r   r[   r\   r  r]   r^   r:  r  rh  r  r  rW   r  r  r  r  r  r   re  r(  r  r9  rP  rT   rQ  r   r  r  r  rU  r;  r   r  r	  
make_graph
make_modelopset_import	packagingr  parse__version__r   r   ),r   r  is_gpt2is_beamsearchis_greedysearchis_samplingrY   onnx_filenamelogits_matmul_weight_paddedgpt2_init_decoder_generatedgpt2_init_decoder_onnx_pathgpt2_init_decoder_onnx_filenamer  r  r  r  r   rh  r  r  r5  attr_to_extendrP  rG  r   r  r  r  r  r  r  graph_inputsrX   r[   r   r]   r^   r  graph_outputsr  r  	new_graph	new_modelr  s,                                               r,   convert_generation_modelr  &  sT    Ov-G)^-FFM+~/JJO'>+BBK&*&D
KKM2KMMNNN
4!##(:1(=(G(G 	$t~):::!d!d!dDKKM9KMMNNNKKcdddd!#D r+ r 	f%&deee' 	v%&tuuu# 	r%&pqqq ! 
] 
4;\ 
w
 
 	
 ( p1J pnooo ( \ \Z[[[  
	0A!B!B 
	KKWDDUWWXXXX$ ] 1 8 8+t~IZ7Z7ZVV`f! ! %)dk):):)A=$Q$Q$Z$Z$\$\!KKfT-DfftO`fffggg 	!? 	KKwt?PwwW[Wuww    KKN)@NNNOOOt #('Ni/// 0 0- 01< 0 	,DL] , , ,	
 	
 	
 'C4CTVZVs&t&t#* 	NNo   #("&;hh h .h 2=h
 	VARVVVWWW*B*I*In	(999FFv+
 +
' '+4+<+<+CEd&e&e&n&n&p&p#&@:D<Y'
 '
# + 	NNN   ' 	h/Yt<0
 0
 	h fggg
 # Xd&> XB] Xit7Hiiijjj)4+HIII& 	XKKw;Vwwwxxx79VWWW ^+D,Ct~^^^	D	 	 )$*AT^\\\*4+Bdn]]]| (&f&&'''&L*1J6&&v7JL"J "_
B(B(OD$5$OOOM"&/;;;M"&  ]0$.AAA ' 	P&*o6Qfj&k&k&k#48O1R1R1R#). !8!>OOO"=#6GGGF 

 
 
 
 
K 

 
 
  l####b )****b! &''''b "; 	4- 	MM/****MM"9 	"MM&!!!mG# +)*** !+gg-ggggx   D 
{$$0t00	 % 
 
 
 
{$$222	 % 
 
 
 
{$$.T_..	 % 
 
 "DKN 
K&&~|DDK&&~|DDK&&'=t?XYYK&&'7d>Q9XWXYYK&&|$/V:S:SQQYZ[[
 
 
K&&~|DDK&&~|DDK&&|$/V:S:SQQYZ[[K&&'=t?XYY	
 
 
K&&~|DDK&&~|DDK&&|$/V:S:SQQYZ[[K&&'=t?XYYK&&}d6FGGK&&w
;;K&&~t7HIIK&&';T=TUUK&&x==K&&'94;PQQ
 # Vt{99,
SSTUUUN.)))L-''# 	[KKvt7UvvvwwwD:D<YZZZ(F[_```&*o#P#P#P /0CT^TTT!-888!-888 % 	S4 r !pqqqKK~   O}Obcc fRSSSSdeee.}== SWXXXXQRRR/ 	2=-PPLKK|$$  [  [<Z<Z\<Z<Z<Z  [  [  [   	**9m6IJJ**9m6IJJ**,589L9R5S5SWX5X5XF11^` 		
 		
 		
 		
 ' 	j 3 67NP]^^<((  d  d@^@^Q]@^@^@^  d  d  d  
 ) awxxxABYB_``` 0 q9m'-}e: : q !!opppN!!$+"<"<^MdMj"k"kllll -]-@AALKK3|,,hhhiii % 	SKKnooo=m>QRRR , 	h5i6
 6
 	h fgggdk88MDWXXYYY 22;@QT`bsStuuI33L+BSVWUXYYJ33L+BSVWUXYYJ22;@QTUSVWWI;==>TVaVgjkilmm[778H+J[^_]`aaN;;<PR]RcfgehiiL 
 
 
 
K 
	
  ([77kFWZdYeff
J''' / K>>!2\:4N
 
 	-...! ,;;k/,@Q1R
 
 	N+++{ +t) +::[.z0J
 
 	M*** "ty "{11&+:KaSQQD!!! I 
K66@@@
 
		
 
 
K 
K66<(
 
	 KM# /;== 1LBX3Y
 
 	-... %33+\;
S
 

 	V$$$&&	0?g4?((((EgEgEg I &&0#0 '  I $ *%%%%%%==)**W]]8-D-DDDNNMNNNK"&$(		
 	
 	
 	
 	
 		)T[)))
KK.../////r.   r   r   r  r  bad_words_idsc                 <   | j         r-t          j                                        st	          d          | j        t          j        k    r|                                 t          j	        | j         rdnd          }|
                    |           t          j        d           |
                    |          }|
                    |          }g }t          | j                  D ]}	t          j                    }
|                    ||| j        | j        | j        | j        | j        ||| j        | j        | j        |r|ndd| j        p| j                  }	|                    t          j                    |
z
             |j        d         }dd	lm}  |||          S )
a  Test PyTorch performance of text generation.

    Args:
        args (argparse.Namespace): arguments parsed from command line
        model (Union[GPT2LMHeadModel, T5ForConditionalGeneration]): PyTorch model
        input_ids (torch.Tensor): input_ids
        attention_mask (torch.Tensor): Attention mask
        eos_token_id (int): EOS token ID
        pad_token_id (int): Padding token ID
        bad_words_ids (List[List[int]]): Words shall not be generated.

    Raises:
        RuntimeError: PyTorch with CUDA is not available for --use_gpu

    Returns:
        Dict[str, Any]: A dictionary with string with metric name, and value can be integer or string.
    z=Please install PyTorch with Cuda for testing gpu performance.zcuda:0cpuFNTr   r   r  r  r  rW   r  r  r  r  r  r  r  return_dict_in_generateoutput_scoresr   get_latency_result)rh   torchcudais_availabler   r   r
   rw   halfdevicer  set_grad_enabledr  
total_runstimegenerater  r  r  rW   r  r  r  r  rU   rV   r   r9  benchmark_helperr  )r   r   r   r   r  r  r  r  torch_latency_startr  r  s                r,   test_torch_performancer  
  s   4 | \EJ3355 \Z[[[~***

\dl=((>>F	HHV	5!!!V$$I#&&v..NM4?## 2 2	NN)n.!%!:%%!%!:.#6+8B--d$(6R$:R  
 
" 	TY[[501111#J333333mZ888r.   c                    t          j        | j        t           j                  }t	          | j        d                   D ]J}d}t	          | j        d                   D ]+}| |         |         |k    r|dk    rd||         |<   &|dz  },K|S )Nr   r   r_   )r   onesr9  int32r  )r   r  r   r   abs_posrD  s         r,   create_attention_maskr  \
  s    WY_BH===N9?1%&&  yq)** 	 	A|A,..7a<<'(q!!$$1		
 r.   F	sentences	is_greedyc                 H   | j         dk    sJ t          j        | j        | j                  }d|_        |j        |_        t          j        | j        | j        |j	                  }|g d} ||dd	          }|d
         }|d         }d}|
                    |d          }	d |	D             }	| j        rt                              d|	           ng }	|j        }
|
j	        }|
j	        }|
j        }g }d}| j        s@t#          d           t#          d           |                    ||| j        | j        | j        | j        | j        ||| j        | j        | j        |	r|	ndd| j        p| j                  }t#          d
|           t#          d           t#          d|j                   | j        rt#          d|j                   | j        rt#          d|j                   tA          |j                  D ]E\  }}|!                    |d          }|"                    |           t#          | d|            Ft#          d           t#          d           |r|#                                $                                %                    tL          j'                  tM          j(        | j        gtL          j'                  tM          j(        | j        gtL          j'                  tM          j(        | j        gtL          j)                  d}n#|#                                $                                %                    tL          j'                  tM          j(        | j        gtL          j'                  tM          j(        | j        gtL          j'                  tM          j(        | j        gtL          j'                  tM          j(        | j        gtL          j'                  tM          j(        | j        gtL          j)                  tM          j(        | j        gtL          j)                  d}| j        r6tM          j*        |tL          j'                  }| j        r
|	D ]}d||<   ||d<   | j+        rtY          ||          |d<   |j-        d         }| j.        rAt          /                    d           tM          j*        ||ftL          j'                  }||d <   | j0        rtc          | j2                  j3        4                                }t                              d!|           dd"l5m6} t          /                    d#| d$           |g}tA          |          D ]A\  }}tn          j8        9                    |d%tu          |          z             } |||           Bt                              d&|           | j;        rdS t                              d'           ty          | j2        | j=        | j>                  }t                              d(           |?                    d|          }g }t          | jA                  D ]T}t          jB                    }|?                    d|          }|"                    t          jB                    |z
             Udd)lCmD}  |j-        d         } | ||          }!t#          d*           |d         }"t#          d|"           | j        rt#          d|d+                    | j        rt#          d|d,                    |rf|"j-        \  }}#g }$t          |          D ]I}|!                    |"|         d          }|$"                    |           t#          d-| d.|            Jn|"j-        \  }}%}#g }$t          |          D ]d}t          |%          D ]R}&|!                    |"|         |&         d          }|$"                    |           t#          d-| d/|& d|            Se|r|j        E                    || j        d0          }'t          jG        |"          }(t#          d           t#          d1           t#          |'           t#          |           t#          d           t#          d2           t#          |(           t#          |$           t#          d           ||$k    })t#          d3|)rd4nd5           |)|!d6<   | jH        r%t          | ||||||	          }*t#          d7|*           t#          d8|!           |!S )9a9  Test GPT-2 model

    Args:
        args (argparse.Namespace): arguments parsed from command line
        sentences (Optional[List[str]], optional): input text. Defaults to None.

    Returns:
        Union[Dict[str, Any], None]: A dictionary with string with metric name, and value can be integer or string.
    r>   r  left)r   r  N)zThe product is releasedzI enjoy walking in the parkzTest best way to investptTreturn_tensorsr   r   r   walk in park)add_prefix_spacec                     g | ]}|gS r5   r5   r   word_ids     r,   r   z"test_gpt_model.<locals>.<listcomp>
      <<<7gY<<<r.   r  2--------------------------------------------------CTest PyTorch model and beam search with huggingface transformers...r  !huggingface transformers outputs:r  r  r  skip_special_tokens: 'Testing beam search with onnxruntime...r   r  r  r   rX   zYUse prefix vocab mask with all ones in ORT, but no corresponding setting for Torch model.r[   test_data_diroutput_test_datazSaving test_data to z/test_data_set_* ...test_data_set_
ORT inputszCreating ort session......zRun ort session......r  ORT outputs:r_   r   batch z sequence: 
 sequence re   Torch Sequences:ORT Sequences:Torch and ORT result is same	differentparityTorch LatencyORT)Jr   r   r  r   r   padding_side	eos_token	pad_tokenr   r  encoderX   r   r   r  r  ri   r  r  r  r  r  rW   r  r  r  r  rU   rV   r  r  r  r  decoder   r  numpyastyper   r  arrayfloat32r  r\   r  r9  r[   r   rl   r   r   r   r  bert_test_datar  rs   rt   rr   rq   rj   r   rh   rf   runr  r  r  r  r  r  r  
LongTensorrk   r  )+r   r  r  	tokenizerr   r  r   r   	bad_wordsr  r  r  r  r  torch_decoded_sequencesbeam_outputsr   sequencedecoded_sequencerX   bad_word_idr  r[   r  r  
all_inputsdirr   resultlatencyr  r  r  r   r  r  ort_decoded_sequencesnum_sequencesrD  torch_sequencesort_sequencesis_sametorch_latency_outputs+                                              r,   test_gpt_modelr#  h
  s	    ?f$$$$-d.EQUQ_```I#I#-I+.+  E 
 
 
	 YytDDDF{#I,-NI$$Y$FFM<<m<<<M _m4444\F&L&L"J L .hSTTT~~)n.!%!:%%!%!:.#6+8B--d$(6R$:R & 
 
" 	k9%%%1222k<1222' 	E$l&CDDD# 	1(L/000$\%;<< 	. 	.KAx(//d/SS#**+;<<<Q,,*,,----	(OOO	
3444 
"..0077AA(DO#4BHEEE(DO#4BHEEE"$(D,C+DBJ"W"W"W	
 
 #..0077AA(DO#4BHEEE(DO#4BHEEE4>"2"(CCC$&Hd.G-HPRPX$Y$Y$Y h(;'<BJOOO"$(D,C+DBJ"W"W"W
 
  *Wj:::
? 	,, , ,*+
;'')|! R#8L#Q#Q #J 8opppGZ$<BHMMM&7"# 
*T[))099;;_m444333333N=NNNOOOX
":.. 	* 	*IAv',,}.>Q.GHHCS&))))
LLv&&& 
LL-...$T[$,@XYYK
LL()))__T6**F G4?## , ,	OOD&))ty{{U*++++333333#J44F	.q	I	+y!!!# - &),,, #hq	""" E#,? Z "z"" 	= 	=A(//	!RV/WW!(()9:::;1;;)9;;<<<<	=
 3<//]J "z"" 	E 	EA=)) E E#,#3#3IaLOY]#3#^#^ %,,-=>>>CqCCACC1ACCDDDDE
  #&088TE^`bcc(33h !!!o%&&&hm#$$$h)-BB(G*L&&MMM"x 
55 
  
 	o3444	%Mr.   c                 J   | j         dv sJ | j        rt                              d           dS t	          j        | j        | j                  }d|_        | j         dk    r!t          j        | j        | j                  }n t          j        | j        | j                  }|ddg} ||d	d
          }|d         }|d         }d}|                    |          dd         }d |D             }| j        rt                              d|           ng }|j        }	|	j        }
|	j        }|	j        }t                              d|
 d| d|            g }| j        s@t%          d           t%          d           |                    ||| j        | j        | j        | j        | j        |
|| j        | j        | j        |r|ndd
| j        p| j                  }t%          d|           t%          d           t%          d|j                   | j        rt%          d|j                   | j        rt%          d|j                    tC          |j                  D ]E\  }}|"                    |d
          }|#                    |           t%          | d|            Ft%          d           t%          d           tI          j%        |tH          j&                  }| j        r
|D ]}d ||<   |'                                (                                )                    tH          j&                  tI          j*        | j        gtH          j&                  tI          j*        | j        gtH          j&                  tI          j*        | j        gtH          j&                  tI          j*        | j        gtH          j&                  tI          j*        | j        gtH          j+                  tI          j*        | j        gtH          j+                  d!}| j        r||d"<   | j,        rt[          ||          |d<   | j.        rt_          | j0                  j1        2                                }t                              d#|           d d$l3m4} |g}tC          |          D ]A\  }}tj          j6        7                    |d%tq          |          z             } |||           Bt                              d&|           ts          | j0        | j:        | j;                  }g }ty          | j=                  D ]T}t}          j>                    }|?                    d|          }|#                    t}          j>                    |z
             U|j@        d          }d d'lAmB}  |||          }t%          d(           |d          } t%          d|            | j        rt%          d|d)                    | j        rt%          d|d*                    | j@        \  }}!}"g }#ty          |          D ]d}ty          |!          D ]R}$|"                    | |         |$         d
          }|##                    |           t%          d+| d,|$ d|            Se| j        s|j        C                    || j        d          }%t          jE        |           }&t%          d           t%          d-           t%          |%           t%          |           t%          d           t%          d.           t%          |&           t%          |#           t%          d           ||#k    }'t%          d/|'rd0nd1           |'|d2<   | jF        r%t          | ||||
||          }(t%          d3|(           t%          d4|           |S )5a=  Test T5 or MT5 model

    Args:
        args (argparse.Namespace): arguments parsed from command line
        sentences (Optional[List[str]], optional): input text. Defaults to None.

    Returns:
        Union[Dict[str, Any], None]: A dictionary with string with metric name, and value can be integer or string.
    r  zLSkipping parity test as prefix vocab mask is not implemented by Hugging FaceNr  r  r?   z4translate English to French: The product is releasedzsummarize: research continues to show that pets bring real health benefits to their owners. Having a dog around can lead to lower levels of stress for both adults and kids.r  Tr  r   r   r  re   c                     g | ]}|gS r5   r5   r  s     r,   r   z!test_t5_model.<locals>.<listcomp>h  r  r.   r  zeos_token_id:z, pad_token_id:z, vocab_size:r  r  r  r  r  r  r  r  r  r  r   r   r  rX   r  r  r  r  r  r  r_   r   r  r  r  r  r   r  r  r  r  r  )Hr   r[   r   r   r   r  r   r   r  r   r   r	  rX   r  r  r  r  ri   r  r  r  r  r  rW   r  r  r  r  rU   rV   r  r  r  r  r
  r   r   r  r  r  r  r  r  r  r\   r  rl   r   r   r   r  r  r  rs   rt   rr   rq   r   rh   rf   r  r  r  r  r9  r  r  r  r  r  rk   r  ))r   r  r  r   r  r   r   r  r  r  r  r  r  r  r  r   r  r  rX   r  r  r  r  r  r   r  r  r  r  r  r  r   r  r  r  r  rD  r  r   r!  r"  s)                                            r,   test_t5_modelr&  ;  s    ?m++++ cdddt+D,Ct~^^^I#I$*:#n
 
 

 ,;#n
 
 
 B {
	 YytDDDF{#I,-NI$$Y//4M<<m<<<M _m4444\F&L&L"J
LLeeeleeYceefff  .hSTTT~~)n.!%!:%%!%!:.#6+8B--d$(6R$:R & 
 
$ 	k9%%%1222k<1222' 	E$l&CDDD# 	1(L/000$\%;<< 	. 	.KAx(//d/SS#**+;<<<Q,,*,,----	(OOO	
3444*RX666J (( 	( 	(K&'J{## ]]__**,,33BH==h0AAAh0AAAXt~.bh??? "$*C)DBH U U U(D$7#8
KKK h(?'@
SSS F  *)|! R#8L#Q#Q  *T[))099;;_m444333333X
":.. 	* 	*IAv',,}.>Q.GHHCS&))))
LLv&&&$T[$,@XYYK G4?## , ,	v..ty{{U*++++#J33333344F	.q	I	+y!!!# - &),,, #hq	""".7o+Z
: A A}%% 	A 	AA(//	!QUY/ZZ!(()9:::?1????-=??@@@@	A
  #&088TE^`bcc(33h !!!o%&&&hm#$$$h)-BB(G*L&&MMM"x 
55 
  
 	o3444	%Mr.   c                 ^   t          |           }t          |j                   |j        dv r|j        r;t
          j                            |j                  st          d|j                   |j	        r;t
          j                            |j	                  st          d|j	                   |j        r|j	        r|j	        r|j        st          d          |j
        dk    o
|j        dk    }|j        dk    r|r|j        dk    r[|j        dk     rPt          |t          j                   t                               d	           |j        d
k    s|j        s|j        rdS n*t          |t          j                   nt          |           t                               d           |j        dv rt+          ||          }nt-          |||          }|rU|j        r,t                               d|j         d|j         d           n"t                               d|j                    |S )a/  Main entry function

    Args:
        argv (Optional[List[str]], optional): _description_. Defaults to None.
        sentences (Optional[List[str]], optional): input text. Defaults to None.

    Raises:
        ValueError: Path does not exist: --encoder_decoder_init_onnx
        ValueError: Path does not exist: --decoder_onnx
        ValueError: --decoder_onnx and --encoder_decoder_init_onnx are not used together for T5

    Returns:
        Union[Dict[str, Any], None]: A dictionary with string with metric name, and value can be integer or string.
    r  z1Path does not exist: --encoder_decoder_init_onnx z$Path does not exist: --decoder_onnx zB--decoder_onnx shall use together with --encoder_decoder_init_onnxr_   r>   rd   rb   zThe test for gpt2_sampling onnx model is limited to non-custom model with small top_p(e.g <=0.01) value. The result should be the same as gpt2 greedy search.g{Gz?Nzstart testing model...)r  )r  r  zOutput files: r:   z.datazOutput file: )r   r   rH   r   r   rs   rt   r  r  r   r  r  r  r  r$   r4   r   r   r  r^   r3   r&  r#  rP   r   )r6   r  r   r  r  s        r,   r   r     sW     4  D-'') 	s"'..A_2`2` 	sqQUQoqqrrr 	YRW^^D4E%F%F 	YWDDUWWXXX* 	c43D 	c	c&*&D	c abbb!#F(AQ(FI&  Y :
S 0 0$T>+BCCCKK p   zD  DK 49  ! %T>+FGGGG &&&
KK()))-''ty999	YOOO 7( 	7KKJJJJJJKKKKKK555666Mr.   __main__r)   )T)r)  r*  NN)r*  )r   r_   re   )NFr  )`__doc__rm   loggingr   rs   r  enumr   pathlibr   typingr   r   r   r   r	   r  r   r   r  r  r
   r   fusion_utilsr   r   r   r   rI  r   transformersr   r   r   r   r   r   r   r   onnxruntimer   r   r   r   4onnxruntime.transformers.models.gpt2.convert_to_onnxr   r   0onnxruntime.transformers.models.gpt2.gpt2_helperr   2onnxruntime.transformers.models.t5.convert_to_onnxr   r   ,onnxruntime.transformers.models.t5.t5_helperr    r!   	getLoggerr   r$   rq   	Namespacer   r   r   boolr   r   r   r  r!  r(  rx   dictrF  rQ  rU  rf  rm  rv  r  r  r  r	  r$  r9  rP  rV  r|  r  r2   r  Tensorr  r  r#  r&  r/   r5   r.   r,   <module>r:     s  
% %N    				              3 3 3 3 3 3 3 3 3 3 3 3 3 3       4 4 4 4 4 4 4 4 $ $ $ $ $ $ 4 4 4 4 4 4 4 4 4 4            	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 j i i i i i i i i i i i ] ] ] ] ] ] S S S S S S j j j j j j d d d d d d d d		2		    T   B B(49- B9K B B B BJ*)x) *) *) *) *)Z!X' ! ! ! !:O Os Od O O O O$K KC K4 K[_ K K K K\3  D Ue    B5 5I 5 5 5 5pIodo Io) Io Io Io IoXAa4? Aay Aa Aa Aa AaN #'+'+g! g!g!g! g! 	g!
 tng! tng! g! g! g!T: j    (   
+   >"# "# "#J  i i i-J - - - -`M
M&*M>BM	M M M M`93* 93 93 93 93z fh` ``!$`47`IL`_b` ` ` `F$f $f $f $f $fNV* V V V Vr8 8 8 8 8v # ae    : [_I II47ISWI	I I I IX"0 "0 "0J ZhYr q0 q08#5 q0 q0 q0 q0 q0h?9

?9"<<=?9 |?9 L	?9
 ?9 ?9 S	??9 
#s(^?9 ?9 ?9 ?9D	 	 	P P+ Pc8K P_c P P P Pfz z* zxS	7J z z z zz8 8xS	" 8htCy6I 8 8 8 8v zDFFFFF r.   