
    g@                     |   d dl Z d dlZd dlZd dlmZ d dlmZ d dlZd dlm	c m
Z d dlmZmZmZ d dlmZ ddlmZ ddlmZ d	d
lmZ d	dlmZmZ  e            rd dlmZmZmZm Z m!Z! d dl"m#Z#m$Z$m%Z%m&Z&m'Z' d dl(m)Z)m*Z*m+Z+ d dl,m-Z-m.Z. d dl/mZ0 d dl/m1Z1 d dl2m3Z3 d dl4m5Z5m6Z6 d dl7m8Z8 d dl9m:Z: d dl;m<Z< d dl=m>Z> d dl?m@Z@mAZAmBZBmCZCmDZDmEZE d dlFmGZGmHZHmIZImJZJ d dlKmLZL d dlMmNZN d dlOmPZPmQZQ d dlRmSZSmTZT d dlUmVZV d dlWmXZXmYZYmZZZm[Z[m\Z\m]Z] d dl^m_Z_m`Z`maZambZb d=d!Zcd" Zd G d# d$          Zed% Zfd& Zg G d' d(e          Zhd) Zi G d* d+          Zj G d, d-e          Zkd. Zl G d/ d0e          Zm G d1 d2em          Zn G d3 d4em          Zo G d5 d6em          Zpd7 Zqdi fd8Zr G d9 d:ej	        js                  Ztd; Zud< ZvdS )>    N)ABC)partial)BCEWithLogitsLossCrossEntropyLossMSELoss)DistributedDataParallel   )AcceleratedOptimizer)AcceleratedScheduler   )is_megatron_lm_available)recursively_applysend_to_device)get_argsget_num_microbatchesget_tensorboard_writerget_tokenizerprint_rank_last)_add_data_args_add_validation_args!core_transformer_config_from_args
parse_argsvalidate_args)load_args_from_checkpointload_checkpointsave_checkpoint)mputensor_parallel)finalize_model_grads)	ModelType)get_tensor_model_parallel_group"get_tensor_model_parallel_src_rank)get_forward_backward_func)get_model_config)build_train_valid_test_datasets)set_global_variables)_compile_dependencies_init_autoresume_initialize_distributed_set_random_seedset_jit_fusion_optionswrite_args_to_tensorboard)	BertModelFloat16ModuleGPTModelT5Model)Classification)get_megatron_optimizer)broadcast_int_listbroadcast_tensor)%beam_search_and_return_on_first_stage/generate_tokens_probs_and_return_on_first_stage)_vocab_size_with_padding)%build_train_valid_test_data_iteratorsget_optimizer_param_schedulernum_floating_point_operationssetup_model_and_optimizer
train_steptraining_log))average_losses_across_data_parallel_groupcalc_params_l2_normget_ltor_masks_and_position_idsunwrap_modelTc           	         t                      }|j        rdnd}|j        dk    r*t          d|j         d| d           t          d           t          |          }|j        dk    rG|j        r&|j        rd	nd}t          |||j        d
| |          }nst          ||j	        d	| |          }nY|j        dk    rt          |dd
| |          }n9|j        dk    rt          |dd
| |||          }nt          d|j                   |S )zBuild the model.zpre-trainingzfine-tuningr   z	Building z model in the z mode.zThe Megatron LM model weights are initialized at random in `accelerator.prepare`. Please use `accelerator.load_checkpoint` to load a pre-trained checkpoint matching the distributed setup.bertr	   T)confignum_tokentypesadd_binary_headparallel_outputpre_processpost_process)rD   num_classesrE   rH   rI   gpt)rD   rE   rG   rH   rI   t5)rD   rE   rG   rH   rI   add_encoderadd_decoderUnsupported model type: )r   pretraining_flagrankprintmodel_type_namer   bert_binary_headr-   r1   
num_labelsr/   r0   
ValueError)	rH   rI   rM   rN   argsmoderD   rE   models	            X/var/www/html/ai-engine/env/lib/python3.11/site-packages/accelerate/utils/megatron_lm.pymodel_provider_funcr[   [   s   ::D!2E>>DyA~~J$.JJdJJJKKKx	
 	
 	
 /t44Fv%%  	"&"7>QQQN- $ 5 $')  EE # O ')  EE 
		&	& #%
 
 
 
		%	% #%##
 
 
 JD4HJJKKKL    c                 h   |                      d           t                      }| j        j        j        s| j        j        j        t          d          | j        j        j        }| j        j                            |          }t          | |          }t          | |d           }nst          j
        }|j        dk    rt          j        }t          }| j        j        j        | j        j        j        }t          |||j        |j        |j                  \  }}}t%          |          |_        |||fS )Nz#Preparing model optimizer schedulerzaYou must provide a `custom_model_provider_function` when using a `custom_prepare_model_function`.)	schedulerrL   )no_wd_decay_condscale_lr_condlr_mult)rR   r   statemegatron_lm_plugincustom_prepare_model_functioncustom_model_provider_functionrV   prepare_optimizerprepare_schedulerr    encoder_or_decoderrS   encoder_and_decoderr[   r;   r_   r`   ra   len	model_len)acceleratorrW   custom_model_provider_funcrY   	optimizerr^   
model_typemodel_provider_func_s           rZ   !prepare_model_optimizer_schedulerrq      s3   ;<<<::D+IU/NVs   &1%6%I%h"!4RRSmnn%k599	%k9MMM		1
4''"6J2/NZ#.#4#G#f (A !2,L)
 )
 )
%	9 ZZDN)Y&&r\   c                   *    e Zd ZdZd Zd Zd Zd ZdS )MegatronLMDummyDataLoaderz
    Dummy dataloader presents model parameters or param groups, this is primarily used to follow conventional training

    Args:
        **dataset_kwargs: Megatron data arguments.
    c                    t          j                    }t          |          }t          |          }|                                }t          |d                   | _        | j                            |           d| j        d<   d S )Nr   Tmegatron_dataset_flag)argparseArgumentParserr   r   parse_known_argsvarsdataset_argsupdate)selfdataset_kwargsparser	data_argss       rZ   __init__z"MegatronLMDummyDataLoader.__init__   sz    (**''%f--++--	 1..  000591222r\   c                     t                      }| j                                        D ]H\  }}t          ||d          }||k    rt	          d| d| d| d|            t          |||           Id S )N z<WARNING: MegatronLMDummyDataLoader overriding arguments for : with )r   rz   itemsgetattrrR   setattr)r|   rW   keyvalue	old_values        rZ   set_megatron_data_argsz0MegatronLMDummyDataLoader.set_megatron_data_args   s    zz+1133 	& 	&JCc2..IE!!<< <&< <.1< <49< <   D#u%%%%	& 	&r\   c                 ,   d }|j         j        j        |j         j        j        S 	 t                      }|j        dk    rddlm} d|_        |S |j        dk    rddlm} d|_        |S |j        dk    rddl	m} d|_        |S n# t          $ r Y nw xY w|S )Nc                    t                      }t          |j        t          t          f          r|j        n|j        g|j        | |j        d}|j        dk    r#|                    |j	        |j
        d           nn|j        dk    r|                    d|j	        i           nF|j        dk    r$|                    |j        |j        dd           nt          d|j                   t          d	i |\  }}}|||fS )
z&Build train, valid, and test datasets.)data_prefixsplits_stringtrain_valid_test_num_samplesseedrC   )max_seq_lengthbinary_headrK   r   rL   )r   max_seq_length_decdataset_typerO    )r   
isinstance	data_pathlisttuplesplitr   rS   r{   
seq_lengthrT   encoder_seq_lengthdecoder_seq_lengthrV   r%   )train_val_test_num_samplesrW   rz   train_dsvalid_dstest_dss         rZ   "train_valid_test_datasets_providerzlMegatronLMDummyDataLoader.get_train_valid_test_datasets_provider.<locals>.train_valid_test_datasets_provider   sN   ::D1;DNTSXM1Z1Zpt~~aeao`p!%0J		 L #v--##*./'+'<     %..##($/   
 %--##*.*A.2.E(,     !!RD<P!R!RSSS*I*Y*YL*Y*Y'HhXw..r\   rC   r   )r   TrK   rL   )rb   rc   *custom_megatron_datasets_provider_functionr   rS   pretrain_bertr   is_distributedpretrain_gptpretrain_t5ImportError)r|   rl   r   rW   s       rZ   &get_train_valid_test_datasets_providerz@MegatronLMDummyDataLoader.get_train_valid_test_datasets_provider   s    !	/ !	/ !	/F /Zf$7bb	::D#v--LLLLLLDH2A99%..KKKKKKDH2A99%--JJJJJJDH2A99	 .
  	 	 	D	11s   'B B )B 
BBc                    t                      }|                     |          }|j        g }g }g }t          t	          |dd                    D ]v}t          j        |           t          |          }|                    |d                    |                    |d                    |                    |d                    wnt          |          \  }}}|||fS )Nrk   r   r   r	   )	r   r   $virtual_pipeline_model_parallel_sizeranger   r   (set_virtual_pipeline_model_parallel_rankr8   append)	r|   rl   rW   !train_valid_test_dataset_providertrain_data_iteratorvalid_data_iteratortest_data_iteratori	iteratorss	            rZ   r8   z?MegatronLMDummyDataLoader.build_train_valid_test_data_iterators  s    zz,0,W,WXc,d,d)4@"$"$!#74a8899 8 8<Q???ABcdd	#**9Q<888#**9Q<888")))A,77778 Lq1L LH!46H #$79KKKr\   N)__name__
__module____qualname____doc__r   r   r   r8   r   r\   rZ   rs   rs      s`         : : :	& 	& 	&:2 :2 :2xL L L L Lr\   rs   c                     G d d          }|d u }t          j        |t           j        | j                  }t           j                            |t                      t                                 |s|r
 |            S |S )Nc                       e Zd Zd Zd ZdS )?_handle_megatron_data_iterator.<locals>.DummyMegatronDataloaderc                     | S Nr   r|   s    rZ   __iter__zH_handle_megatron_data_iterator.<locals>.DummyMegatronDataloader.__iter__  s    Kr\   c                     i S r   r   r   s    rZ   __next__zH_handle_megatron_data_iterator.<locals>.DummyMegatronDataloader.__next__"  s    Ir\   N)r   r   r   r   r   r   r\   rZ   DummyMegatronDataloaderr     s2        	 	 		 	 	 	 	r\   r   dtypedevicegroup)torchtensorboolr   distributed	broadcastr"   r!   )rl   data_iteratorr   is_data_iterator_emptyis_src_data_iterator_emptys        rZ   _handle_megatron_data_iteratorr     s            +d2!&.DEJ_j_q!r!r!r	"$F$H$HPoPqPq      & )*@ )&&(((r\   c           
        	 |                      d           t                      }|j        sddlm	m} |j        |j        z  }	fd	D             }|d         Ot          |d         t          j
        j        j                  r||d         _        n|d= |d= |d= ||d         _        n|d= ||d<   t          j
        j        j        j        fi | || j        t#          j                    t#          j                    d	d
| j                                        | j                  S |j        |j        \  |_        |_        |_        nd\  |_        |_        |_        |j        |j        z  |_                            |           \  }}}|j        |j        z  |_        t9          | |          }t9          | |          }t9          | |          }|||fS )NzPreparing dataloaderr	   )_PYTORCH_DATALOADER_KWARGSprepare_data_loaderc           	      @    i | ]}|t          ||                   S r   )r   ).0kr   
dataloaders     rZ   
<dictcomp>z'prepare_data_loader.<locals>.<dictcomp>6  s-    oooq!WZ,Fq,IJJooor\   
batch_sizesamplershufflebatch_samplerFT)num_processesprocess_indexsplit_batchesput_on_device	rng_typesdispatch_batches)r   r   r   )rl   r   )rR   r   ru   data_loaderr   r   micro_batch_sizenum_micro_batchesr   r   utilsdataBatchSamplerr   
DataLoaderdatasetr   r   get_data_parallel_world_sizeget_data_parallel_rankr   copyr   consumed_samplesconsumed_train_samplesconsumed_valid_samplesconsumed_test_samplesr8   r   )
rl   r   rW   r   r   kwargsr   r   r   r   s
    `       @rZ   r   r   /  sN   ,---::D% ;LQQQQQQQQ043IIoooooTnooo,'&+U[-=-JKK F/?y!,,9%9%<(5E'22'#3F< [%01CNNvNN
 #":<<466!+0022(9	
 	
 	
 		
  ,
 %	++** dk`D')DdF` $ 58N N <<[II		
 $ 59O O<#3F
 
 
 =#3F
 
 
 <cuvvv"$79KKKr\   c                   B     e Zd Z fdZddZd Zed             Z xZS )MegatronLMOptimizerWrapperc                 P    t                                          |dd            d S )NF)device_placementscalersuperr   )r|   rn   	__class__s     rZ   r   z#MegatronLMOptimizerWrapper.__init__r  s(    U4HHHHHr\   Nc                     d S r   r   )r|   set_to_nones     rZ   	zero_gradz$MegatronLMOptimizerWrapper.zero_gradu      r\   c                     d S r   r   r   s    rZ   stepzMegatronLMOptimizerWrapper.stepx  r   r\   c                     | j         j        S )zTWhether or not the optimizer step was done, or skipped because of gradient overflow.)rn   skipped_iterr   s    rZ   step_was_skippedz+MegatronLMOptimizerWrapper.step_was_skipped{  s     ~**r\   r   )	r   r   r   r   r   r  propertyr  __classcell__r   s   @rZ   r   r   q  s{        I I I I I      + + X+ + + + +r\   r   c                     |                      d           t                      }t          ||j        |j        |j                  S )NzPreparing optimizer)rR   r   r2   r_   r`   ra   )rl   rY   rW   s      rZ   rf   rf     s>    +,,,::D!%)>@RTXT`aaar\   c                       e Zd ZdZddZdS )MegatronLMDummySchedulera  
    Dummy scheduler presents model parameters or param groups, this is primarily used to follow conventional training
    loop when scheduler config is specified in the deepspeed config file.

    Args:
        optimizer (`torch.optim.optimizer.Optimizer`):
            The optimizer to wrap.
        total_num_steps (int):
            Total number of steps.
        warmup_num_steps (int):
            Number of steps for warmup.
        **kwargs (additional keyword arguments, *optional*):
            Other arguments.
    Nr   c                 >    || _         || _        || _        || _        d S r   )rn   total_num_stepswarmup_num_stepsr   )r|   rn   r  r  r   s        rZ   r   z!MegatronLMDummyScheduler.__init__  s$    ". 0r\   Nr   )r   r   r   r   r   r   r\   rZ   r
  r
    s2              r\   r
  c                   $     e Zd Z fdZd Z xZS )MegatronLMSchedulerWrapperc                 L    t                                          ||           d S r   r   )r|   r^   
optimizersr   s      rZ   r   z#MegatronLMSchedulerWrapper.__init__  s#    J/////r\   c                     d S r   r   )r|   rW   r   s      rZ   r  zMegatronLMSchedulerWrapper.step  s    r\   )r   r   r   r   r  r  r  s   @rZ   r  r    sG        0 0 0 0 0      r\   r  c                 N    |                      d           t          |          }|S )NzPreparing scheduler)rR   r9   )rl   rn   r^   s      rZ   rg   rg     s*    +,,,-i88Ir\   c                   4     e Zd ZdZ fdZd Zd Zd Z xZS )AbstractTrainStepz;Abstract class for batching, forward pass and loss handler.c                 V    t                                                       || _        d S r   )r   r   name)r|   r  r   s     rZ   r   zAbstractTrainStep.__init__  s$    			r\   c                     d S r   r   )r|   rl   ru   s      rZ   get_batch_funcz AbstractTrainStep.get_batch_func  r   r\   c                     d S r   r   r   s    rZ   get_forward_step_funcz'AbstractTrainStep.get_forward_step_func  r   r\   c                     d S r   r   )r|   rl   s     rZ   get_loss_funczAbstractTrainStep.get_loss_func  r   r\   )	r   r   r   r   r   r  r  r  r  r  s   @rZ   r  r    sk        EE              r\   r  c                   4     e Zd ZdZ fdZd Zd Zd Z xZS )BertTrainStepzg
    Bert train step class.

    Args:
        args (`argparse.Namespace`): Megatron-LM arguments.
    c                 Z   t                                          d           |                     ||j                  | _        |                     ||j        |j                  | _        | 	                    |j        |j
                  | _        |j        s	d | _        d S ddlm} || _        d S )Nr   r   )SequenceClassifierOutput)r   r   r  ru   	get_batchr  rP   rU   	loss_funcr  rT   forward_stepmodel_return_dictmodel_output_classtransformers.modeling_outputsr"  )r|   rl   rW   r"  r   s       rZ   r   zBertTrainStep.__init__  s    ))),,[$:TUU++K9NPTP_`` 66t7LdNcdd% 	?&*D###NNNNNN&>D###r\   c                     d }d }|j         j        j        |j         j        j        S |r	 ddlm} |S # t
          $ r Y nw xY w|S |S )Nc                    g d}t           j        }| t          |           }nd}t          j        |||          }|d                                         }|d                                         }|d                                         }|d                                         }|d                                         }	|d                                         }
|||||	|
fS )	Build the batch.)texttypeslabels	is_random	loss_maskpadding_maskNr,  r-  r/  r0  r.  r1  r   int64nextr   broadcast_datalongfloat)r   keysdatatyper   data_btokensr-  sentence_orderr0  	lm_labelsr1  s              rZ   get_batch_megatronz8BertTrainStep.get_batch_func.<locals>.get_batch_megatron  s     YXXD{H (M**$3D$IIF F^((**F7O((**E#K05577N{+1133Ix(--//I!.16688L5.)YTTr\   c                    t          |           }t          |t          j                                                  }|d                                         }|d                                         }d|v r|d                                         }nd}d|v rD|d                                         }|d         dk                        t          j                  }nd}d}d|v r|d                                         }nd}||||||fS )r+  	input_idsattention_masktoken_type_idsNr.  next_sentence_label)r4  r   r   cudacurrent_devicer6  tor7  )r   r   r;  r1  r-  r=  r0  r<  s           rZ   get_batch_transformerz;BertTrainStep.get_batch_func.<locals>.get_batch_transformer  s
   &&D!$
(A(A(C(CDDD +&++--F 016688L4''-.33554 N//11	!(^t377DD		 	 	$,,!%&;!<!A!A!C!C!%5.)YTTr\   r   r#  )rb   rc   custom_get_batch_functionr   r#  r   r|   rl   ru   r>  rH  r#  s         rZ   r  zBertTrainStep.get_batch_func  s    	U 	U 	U0	U 	U 	U2 /IU$7QQ  
	)333333     %%((   4 
A Ac                 h     d } fd}|j         j        j        |j         j        j        S |r|S |S )Nc                 Z   |\  }}|                                 }|                                  } t          j        |                    d          |                     d          z            |                                 z  }|t          j        |                    dd                                           |                    d          d          }|                                 }||z   }t          ||g          }||d         |d         dfS |}t          |g          }|d|d         ifS )Nr	   )ignore_indexr   r   )lm losszsop lossrQ  )r7  r   sumviewreshapeFcross_entropyr>   )	r0  r<  output_tensorlm_loss_
sop_logitslm_losssop_losslossaveraged_lossess	            rZ   loss_func_pretrainz7BertTrainStep.get_loss_func.<locals>.loss_func_pretrain  s    #0 Hj~~''H!))Iib 1 1I4E4Eb4I4I IJJY]]__\G%?:??2q+A+A+G+G+I+I>K^K^_aKbKbqsttt#>>++)"KWV^L_"`"`);YZI[\\\\ "KWI"V"Vi);<<<r\   c                    dk    rAt                      } ||                    d          |                     d                    }nj        dk    ra| j        t          j        t          j        fv rBt                      } ||                    d          |                     d                    }nt                      } |||           }t          |g          }|d|d         ifS )Nr   rO  r\  r   )
r   rS  rU   r   r   r6  intr   r   r>   )r.  logitsloss_fctr\  r]  rU   r|   s        rZ   loss_func_finetunez7BertTrainStep.get_loss_func.<locals>.loss_func_finetune&  s    Q"99xBRAA1$$&,5:uy:Q*Q*Q+--xB
 ; ;V[[__MM,..x//GOOO&/!"4555r\   rb   rc   custom_loss_function)r|   rl   rP   rU   r^  rc  s   `  `  rZ   r  zBertTrainStep.get_loss_func  se    	= 	= 	=&	6 	6 	6 	6 	6 	6 /DP$7LL 	&%%%%r\   c                       fd}|S )Nc                                          |           \  }}}}}}
sd}r' |||||          }|t          j        ||          fS  ||||          }	|	t          j        |          fS )Forward step.Ntokentype_idsr=  )rj  r#  r   r$  )r   rY   r;  r-  r<  r0  r.  r1  rW  ra  rT   rP   r|   s             rZ   r%  z9BertTrainStep.get_forward_step_func.<locals>.forward_step<  s    MQ^^\iMjMjJFE>9fl#  ? %fl%[a b b b$gdni&X&XXXv|5IIIwt~v>>>>r\   r   )r|   rP   rT   r%  s   ``` rZ   r  z#BertTrainStep.get_forward_step_func;  s0    	? 	? 	? 	? 	? 	? 	? r\   	r   r   r   r   r   r  r  r  r  r  s   @rZ   r   r     sq         
? 
? 
? 
? 
?>) >) >)@'& '& '&R      r\   r   c                   4     e Zd ZdZ fdZd Zd Zd Z xZS )GPTTrainStepzf
    GPT train step class.

    Args:
        args (`argparse.Namespace`): Megatron-LM arguments.
    c                    t                                          d           |                     ||j                  | _        |                     |          | _        |                                 | _        |j	        dz
  | _
        |j        t                      }|j        | _
        |j        | _        |j        | _        |j        | _        |j        s	d | _        d S ddlm} || _        d S )Nrn  r   r   )!CausalLMOutputWithCrossAttentions)r   r   r  ru   r#  r  r$  r  r%  padded_vocab_size	eod_token
vocab_filer   eodreset_position_idsreset_attention_maskeod_mask_lossr&  r'  r(  rp  )r|   rl   rW   	tokenizerrp  r   s        rZ   r   zGPTTrainStep.__init__T  s    (((,,[$:TUU++K88 6688/!3?&%I&]DN"&"9$($=!!/% 	H&*D###WWWWWW&GD###r\   c                       fd} fd}|j         j        j        |j         j        j        S |r	 ddlm} |S # t
          $ r Y nw xY w|S |S )Nc                    dg}t           j        }| t          |           }nd}t          j        |||          }|d                                         }|ddddf                                         }|ddddf                                         }t          |j        j	        j
        j                  \  }}	}
|||	||
fS )zGenerate a batchr,  Nr   rO  )r   r3  r4  r   r5  r6  
contiguousr@   rr  ru  rv  rw  )r   r8  r9  r   r:  tokens_r.  r;  rA  r0  position_idsr|   s              rZ   r>  z7GPTTrainStep.get_batch_func.<locals>.get_batch_megatronh  s     8D{H (M**$3D$IIF Vn))++GQQQU^..00FQQQV_//11F 7V(?AZ\`\n7 73NI| 69nlJJr\   c                 8   t          |           }d|d         i}t          |t          j                                                  }|d                                         }t          j        |j        d         df|j        |j	                  	j
        z   }t          j        ||gd          }|d d dd f                                         }|d d d df                                         }t          |	j
        	j        	j        d          \  }}}|||||fS )Nr@  r   r   r   dimrO  T)r4  r   r   rE  rF  r6  zerosshaper   r   rr  concatr{  r@   ru  rv  )
r   r   r|  paddingr.  r;  rA  r0  r}  r|   s
            rZ   rH  z:GPTTrainStep.get_batch_func.<locals>.get_batch_transformer  s   &&Dk!23D!$
(A(A(C(CDDD;',,..Gk7=#3Q"7w}U\UcdddgkguuGlGW#51===GQQQU^..00FQQQV_//11F6U(?AZ\`7 73NI| 69nlJJr\   r   rI  )rb   rc   rJ  r   r#  r   rK  s   `     rZ   r  zGPTTrainStep.get_batch_funcg  s    	K 	K 	K 	K 	K2	K 	K 	K 	K 	K  /IU$7QQ  
	)222222     %%((s   9 
AAc                 r    t                      fd}|j        j        j        |j        j        j        S |S )Nc                    j         r|\  }}n|}|                                }|                     d                                          } j        dk    rt	          j        t	          j        |                    d          | z                                d          |                                                     d          g          }t          j                            |t          j
                               |d         |d         z  }n?t	          j        |                    d          | z            |                                 z  }j        rxt          j                                        }|                                rFJ d| dt          j                                         dt!          j                    d                      t%          |g          }d|d         i}j         r|                    d	|i           ||fS )
NrO  r   r   r   zRank z7: found NaN in local forward loss calculation. Device: z, node: rQ  ra  )return_logitsr7  rS  context_parallel_sizer   catrR  r   
all_reducer   get_context_parallel_groupcheck_for_nan_in_loss_and_gradget_rankisnanrE  rF  osunamer>   r{   )	r0  rW  lossesra  r\  global_rankaveraged_lossoutput_dictrW   s	           rZ   r$  z-GPTTrainStep.get_loss_func.<locals>.loss_func  s   ! '!.&\\^^F!r**0022I)A--y%)FKKOOi,G"H"H"M"Ma"P"PR[R_R_RaRaRfRfghRiRi!jkk!,,T9W9Y9Y,ZZZAwa(yR9!<==	O 2 #/88::::<<  TK T T$z88::T TDFHJJqMT T   FtfMMM$mA&67K! 7""Hf#5666$$r\   )r   rb   rc   re  )r|   rl   r$  rW   s      @rZ   r  zGPTTrainStep.get_loss_func  sK    zz	% 	% 	% 	% 	%< /DP$7LLr\   c                       fd}|S )Nc                                          |           \  }}}}} |||||          }|t          j        |          fS )rh  )r.  rk  )	r   rY   r;  r.  r0  rA  r}  rW  r|   s	           rZ   r%  z8GPTTrainStep.get_forward_step_func.<locals>.forward_step  sS     GKnnUbFcFcCFFI~|!E&,vVVVM '$.)"D"DDDr\   r   r|   r%  s   ` rZ   r  z"GPTTrainStep.get_forward_step_func  s)    	E 	E 	E 	E 	E r\   rl  r  s   @rZ   rn  rn  L  sv         H H H H H&6) 6) 6)p# # #J	 	 	 	 	 	 	r\   rn  c                   v     e Zd ZdZ fdZed             Zed             Zed             Zd Z	d Z
d Z xZS )	T5TrainStepze
    T5 train step class.

    Args:
        args (`argparse.Namespace`): Megatron-LM arguments.
    c                 *   t                                          d           |                     ||j                  | _        |                     |          | _        |                                 | _        |j	        s	d | _
        d S ddlm} || _
        d S )Nr  r   )Seq2SeqLMOutput)r   r   r  ru   r#  r  r$  r  r%  r&  r'  r(  r  )r|   rl   rW   r  r   s       rZ   r   zT5TrainStep.__init__  s    ''',,[$:TUU++K88 6688% 	6&*D###EEEEEE&5D###r\   c                 p    |                      d          }|                      d          }||z  }|dk     }|S )Nr   r	         ?)	unsqueeze)rA  attention_mask_b1sattention_mask_bs1attention_mask_bssextended_attention_masks        rZ   attn_mask_postprocessz!T5TrainStep.attn_mask_postprocess  sI     ,55a88+55a88/2DD"4s":&&r\   c                 h    t          j        t          j        d| | f|                    }|dk     }|S Nr   r   r  )r   trilones)r   r   rA  s      rZ   get_decoder_maskzT5TrainStep.get_decoder_mask  s8    EJ:z/JSY$Z$Z$Z[['#-r\   c                     | j         \  }}|                     d          }t          j        ||df|          }||z  }|dk     }|S r  )r  r  r   r  )	rA  dec_seq_lengthr   r   _r  r  r  r  s	            rZ   get_enc_dec_maskzT5TrainStep.get_enc_dec_mask  s^    &,
A ,55a88"Z^Q(GPVWWW/2DD"4s":&&r\   c                     d }d }|j         j        j        |j         j        j        S |r	 ddlm} |S # t
          $ r Y nw xY w|S |S )Nc                    g d}t           j        }| t          |           }nd}t          j        |||          }|d                                         }|d                                         }|d                                         }|d                                         }|d         dk     }	|d	         dk     }
|d
         dk     }|||||	|
|fS )r+  )text_enctext_decr.  r0  enc_maskdec_maskenc_dec_maskNr  r  r.  r0  r  r  r  r  r2  )r   r8  r9  r   r:  
tokens_enc
tokens_decr.  r0  r  r  r  s               rZ   r>  z6T5TrainStep.get_batch_func.<locals>.get_batch_megatron  s     kjjD{H (M**$3D$IIF  
+0022J
+0022JH%**,,F{+1133Ij)C/Hj)C/H!.1C7Lz9fhR^^^r\   c                    t          |           }t          |t          j                                                  }|d                                         }|d                                         }|dk                        t          j                  }d|v r|d                                         }np|                    |j	        |j
        t          j                  }|dddf                                         |dd	df<   d
|d<   |                    |dk    d
           t                              |d                                                   }t                              |j	        d	         |j
                  }t                              |d                                         |j	        d	         |j
                  }|||||||fS )r+  r@  r.  rC  decoder_input_ids)r   r   .NrO  r   r   ).r   rA  )r4  r   r   rE  rF  r6  rG  r7  	new_zerosr  r   clonemasked_fill_r  r  r  r  )	r   r   r  r.  r0  r  r  r  r  s	            rZ   rH  z9T5TrainStep.get_batch_func.<locals>.get_batch_transformer  s   &&D!$
(A(A(C(CDDDk*//11J(^((**F4++EK88I"d**!"56;;==

#--fl6=X]Xb-cc
&,S#2#X&6&<&<&>&>
37#%&
6"''
d(:A>>>"88>N9O9T9T9V9VWWH"33J4DQ4GIZ[[H&77%&++--z/?/BJDU L z9fhR^^^r\   r   rI  )rb   rc   rJ  r   r#  r   rK  s         rZ   r  zT5TrainStep.get_batch_func  s    	_ 	_ 	_2	_ 	_ 	_. /IU$7QQ  
	)111111     %%((rL  c                 P    d }|j         j        j        |j         j        j        S |S )Nc                 
   |                                 }t          j        |                    d          |                     d          z            |                                 z  }|}t          |g          }|d|d         ifS )NrO  rQ  r   )r7  r   rR  rS  rT  r>   )r0  rW  rX  rZ  r\  r]  s         rZ   r$  z,T5TrainStep.get_loss_func.<locals>.loss_funcC  sw    $**,,Hib 1 1I4E4Eb4I4I IJJY]]__\GDG	RRO)_Q%7888r\   rd  )r|   rl   r$  s      rZ   r  zT5TrainStep.get_loss_funcB  s7    	9 	9 	9 /DP$7LLr\   c                       fd}|S )Nc           	          
                     |           \  }}}}}}} ||||||d|          }	|	t          
j        |          fS )rh  Nri  rk  )r   rY   r  r  r0  r=  r  r  r  rW  r|   s             rZ   r%  z7T5TrainStep.get_forward_step_func.<locals>.forward_stepQ  sr     ^b]k]k^ ^ZJ
Iy(Hl "EJ(LX\hq  M !'$.)"D"DDDr\   r   r  s   ` rZ   r  z!T5TrainStep.get_forward_step_funcP  s)    	E 	E 	E 	E 	E r\   )r   r   r   r   r   staticmethodr  r  r  r  r  r  r  r  s   @rZ   r  r    s         
6 
6 
6 
6 
6 
' 
' \
'   \
 	' 	' \	'=) =) =)~        r\   r  c                      t                      } t                       | j        dk    rt          d| j         d           t          | j        | j                   d S )Nr   z> setting random seeds to z ...)r   r)   rQ   rR   r   r*   data_parallel_random_init)rW   s    rZ   finish_mpu_initr  a  s[    ::D yA~~:49:::;;;TY >?????r\   c                    |                      d           t          j                                        s
J d            t	          |d          }|                                D ]]\  }}t          ||d           6|j        dk    r+t          d| dt          ||           d| d| d	           t          |||           ^|j	        s|
                    d
d          r |j        
J d            t          |           t          |           t          |           t                       t!                       t#                       t%                       t'                      }t          |dd           t)          |j        |          |_        |j        dk    r|j        r|j        dk    rd|_        nd|_        d|_        d S )NzInitializing Megatron-LMzMegatron requires CUDA.T)ignore_unknown_argsr   z*WARNING: overriding default arguments for r   r   )flushuse_checkpoint_argsFz/--use-checkpoints-args requires --load argumentrq  rC   r	   )rR   r   rE  is_availabler   r   r   rQ   r   r  getloadr   r   r&   r  r(   r'   r+   r   r7   orig_vocab_sizerq  rS   rP   rU   rT   	iteration)rl   extra_args_providerargs_defaultsrW   r   r   s         rZ   
initializer  n  s    0111:""$$??&???? )tDDDD $))++ " "
U4d##/yA~~rSrr7SWY\K]K]rrehrrkprr    	c5!!!! (=#4#45JE#R#R (y$$&W$$$!$'''$     ::Dt($//7!9$:NPT!U!Uv%%$*?%DOWXDXDX $ %DNNNr\   c                   v     e Zd ZdZ fdZd Zd Zd Zd Zd Z	d Z
d	 Zd
 Zd Zd Z	 	 	 	 	 	 	 	 ddZ xZS )MegatronEnginez
    Megatron-LM model wrapper

    Args:
        accelerator (:class:`~accelerate.Accelerator`): The accelerator object to use.
        model: Megatron-LM model
        optimizer: Megatron-LM optimizer
        lr_scheduler: Megatron-LM lr scheduler
    c                    t                                                       || _        |d         | _        || _        || _        t                      }|j        j        j	        - |j        j        j	        |fi |j        j        j
        | _        nz|j        dk    rt          ||          | _        nY|j        dk    rt          ||          | _        n8|j        dk    rt          ||          | _        nt!          d|j                   d| j        _        i | _        i | _        d| _        d| _        d| _        d | _        |j        t3                       d S d S )Nr   rC   rK   rL   rO   FT)r   r   module
base_modelrn   r^   r   rb   rc   custom_train_step_classcustom_train_step_kwargstrain_step_handlerrS   r   rn  r  rV   r  total_loss_dicteval_total_loss_dictr  report_memory_flag$num_floating_point_operations_so_farmodule_configtensorboard_dirr,   )r|   rl   rY   rn   r^   rW   r   s         rZ   r   zMegatronEngine.__init__  sm   (""zz/GS&bk&7&J&b' '#)<U' 'D## !V++&3K&F&FD##!U**&2;&E&ED##!T))&1+t&D&DD##N8LNNOOO&+#  "$&!"&451!+%''''' ,+r\   c                     t                      }t           j        d                   } j        j        |_        t           j        d         t                    r|j        r|j	        
J d            d  j        D             |_	        t           j                  dk    r|j	        d         |_	        |j        r@d  j        D             |_        t           j                  dk    r|j        d         |_        |j        rc|j        r\ fdt          t           j                            D             |_        t           j                  dk    r|j        d         |_        t"          |_        |S )Nr   zWhen overlap_grad_reduce is True, config.no_sync_func must be None; a custom no_sync_func is not supported when overlapping grad-reducec                     g | ]	}|j         
S r   )no_syncr   model_chunks     rZ   
<listcomp>z4MegatronEngine.get_module_config.<locals>.<listcomp>  s    "V"V"V;;#6"V"V"Vr\   r   c                     g | ]	}|j         
S r   )start_grad_syncr  s     rZ   r  z4MegatronEngine.get_module_config.<locals>.<listcomp>  s    (d(d(d)D(d(d(dr\   c                     g | ]fd 	S )c                 :    j                             |           S r   )rn   finish_param_sync)xmodel_indexr|   s    rZ   <lambda>z=MegatronEngine.get_module_config.<locals>.<listcomp>.<lambda>  s    $.::;JJ r\   r   )r   r  r|   s    @rZ   r  z4MegatronEngine.get_module_config.<locals>.<listcomp>  s4     & & &OZJJJJJ& & &r\   )r   r$   r  rn   
scale_lossgrad_scale_funcr   LocalDDPoverlap_grad_reduceno_sync_funcrj   delay_grad_reducegrad_sync_funcoverlap_param_gatherdelay_param_gatherr   param_sync_funcr   finalize_model_grads_func)r|   rW   rD   s   `  rZ   get_module_configz MegatronEngine.get_module_config  s   zz!$+a.11!%!:dk!nh// 	ED4L 	E&..V /.. #W"V$+"V"V"VF4;1$$&,&9!&<#% E(d(dX\Xc(d(d(d%t{##q((,2,A!,DF)$ 	C)@ 	C& & & &^cdghlhsdtdt^u^u& & &F" 4;1$$)/)?)B&+?(r\   c                     | j         D ]}|                                 | j        |                                 | _        |                                  d S r   )r  trainr  r  log_eval_resultsr|   model_modules     rZ   r  zMegatronEngine.train  s]     K 	! 	!L    %!%!7!7!9!9Dr\   c                     | j         D ]}|                                 | j        |                                 | _        d S d S r   )r  evalr  r  r  s     rZ   r  zMegatronEngine.eval  sS     K 	  	 L%!%!7!7!9!9D &%r\   c                 2   t                      g t          |          dk    r[j        dk    rMt          dj                  D ]6                    fd|                                D                        7n|gt          | j                  dk    rYt          |          dk    r-fdt          t          | j                            D             nd gt          | j                  z  }n$t          |          dk    rt                    nd }|S )Nr   r   c                 R    i | ]#\  }}||j         z  d z   j         z           $S )r   )r   )r   r   vrW   r   s      rZ   r   z:MegatronEngine.get_batch_data_iterator.<locals>.<dictcomp>  sM        $1 qT%:!:a!etG\=\!\]  r\   c                 .    g | ]}t                    S r   )iter)r   r  data_chunkss     rZ   r  z:MegatronEngine.get_batch_data_iterator.<locals>.<listcomp>  s!    DDDqk""DDDr\   )r   rj   r   r   r   r   r  r  )r|   
batch_databatch_data_iteratorrW   r  r   s      @@@rZ   get_batch_data_iteratorz&MegatronEngine.get_batch_data_iterator  sO   zzz??Q%))q$"899  A&&    (2(8(8(:(:       *lt{a z??Q&& EDDDE#dk2B2B,C,CDDDDVc$+...   8;:7J7J${"3"3"3PT""r\   c                     |                      |          }t          | j        j        || j        | j        | j        | j                  \  }}}}|dk    | j        _        ||||fS )z
        Training step for Megatron-LM

        Args:
            batch_data (:obj:`dict`): The batch data to train on.
        )forward_step_funcr   rY   rn   opt_param_schedulerrD   r   )	r  r<   r  r%  r  rn   r^   r  r  )r|   r  r  loss_reducedr  	grad_normnum_zeros_in_grads          rZ   r<   zMegatronEngine.train_step  s{     #:::FFCM"5B-+n $%D
 D
 D
@lI/@ '3a&7#\96GGGr\   c           	         t                      }|                     |          }t                      } || j        j        || j        t                      |j        |j        d          }|j	        dk    rt          j                                         |xj        t          j                    |j        z  t                      z  z  c_        t          j        d          rui }|d         D ]hfd|D             }t#          |d         j                  dk    r#t'          |          t#          |          z  |<   Qt          j        |          |<   i|S i S )z
        Evaluation step for Megatron-LM

        Args:
            batch_data (:obj:`dict`): The batch data to evaluate on.
        T)r	  r   rY   num_microbatchesr   r   forward_onlyr   )ignore_virtualr   c                      g | ]
}|         S r   r   )r   r  r   s     rZ   r  z,MegatronEngine.eval_step.<locals>.<listcomp>B  s    )E)E)EQ!C&)E)E)Er\   )r   r  r#   r  r%  r  r   r   r   empty_unused_memory_levelr   rE  empty_cacher   r   r   is_pipeline_last_stagerj   r  rR  r  )	r|   r  rW   r  forward_backward_func
loss_dictsr  losses_reduced_for_keyr   s	           @rZ   	eval_stepzMegatronEngine.eval_step"  st    zz":::FF 9 ; ;**"5B-+133!2
 
 

 )Q..J""$$$##,..1FFI]I_I__	
## %T::: 		 L!!} M M)E)E)E)E*)E)E)E&-a06771<<(+,B(C(CcJ`FaFa(aL%%(-5K(L(LL%%	r\   c                 "   t                      }| j        d         j        r | j        d
i |\  }}}}| xj        dz  c_        t          j                    |j        z  t                      z  }|xj	        |z  c_	        | xj
        t          ||          z  c_
        |j        | j                                                                        }d }	|j        rt#          | j                  }	t'          || j        | j        j        d         d         | j        || j        |||	|
  
        | _        n | j        d
i |}|j        |D ]}
| j                            |
t4          j                            dg                    ||
         z   | j        |
<   | j                            |
dz   t4          j                            dg                    t4          j                            dg          z   | j        |
dz   <   t5          j        dt4          j                                                  }|D ]+}
t?          ||
         j                   dk    r|||
         z  },d }d|v r|d         }| j!        j"        | j!        "                    ||	          S |S )Nr   r   lr        
_num_iters      ?r  ra  )r\  ra  r   )#r   r  trainingr<   r  r   r   r   r   r   r  r:   r  rn   get_loss_scaleitemlog_params_normr?   rY   r=   r  param_groupsr  r  r  r  r   rE  FloatTensorr   rF  rj   r  r  r'  )r|   r  rW   	loss_dictr  r  r  r   
loss_scaleparams_normr   r\  ra  s                rZ   forwardzMegatronEngine.forwardJ  s    zz;q>" !	6DSDODaDaV`DaDaAI|Y0ANNaNN9;;d>SSVjVlVllJ'':5''559VW[]g9h9hh55#/!^::<<AACC
"' B"5dj"A"AK*6(N/248N+ %+ +' '4444I#/$ 6 6C155c5:;Q;QSVRW;X;XYY\efi\jj -c2 EID]DaDal*EJ,B,BC5,I,IE E
..u55E6D-cL.@AA |C
(A(A(C(CDDD 	' 	'C9S>'((A--	#&y  x(F"5A*==4PV=WWWr\   c                     t                      }|j        | j        dk    rd S t                      }t                      }d| j         d}| j        D ]}|                    d          r| j        |         | j        |dz            z  }|| d| dz  }t          j        t          d|	                                                    }|j
        r|| d| dz  }|rW|                    | d|	                                | j                   |j
        r|                    | d	|| j                   t          |          d
z   }t          d|z             t          |           t          d|z             i | _        d S )Nr   zvalidation loss at iteration z | r  z value:    z PPL: z validationz validation pplr   -)r   r  r  r   r  endswithmathexpminr!  rP   
add_scalarrj   r   )r|   rW   writerstringr   r   ppllengths           rZ   r  zMegatronEngine.log_eval_results  s   zz'4>Q+>+>Fzz'))DDDD, 	T 	TC||L)) -c2T5NsUaOa5bbE00e0000F(3r5::<<0011C$ 1S000000 T!!S"5"5"5uzz||T^TTT( T%%&=&=&=sDNSSSVqf%%%f%%%$&!!!r\   c                 *   |                                   t                      }||_        t          j                                         t          | j        | j        | j	        | j
        | j                   t          j                                         d S )N)r  )r  r   saver   r   barrierr   r  r  rn   r^   r  )r|   
output_dirrW   s      rZ   r   zMegatronEngine.save_checkpoint  s    zz	!!###NKNN151Z	
 	
 	
 	
 	!!#####r\   c                    t                      }||_        d|_        d|_        t          j                                         t          | j        | j	        | j
                  \  }}t          j                                         || _        || _        |j        r&| j        dk    r| j	                                         d S d S d S r  )r   r  r   r   r   r   r7  r   r  rn   r^   r  r  fp16reload_model_params)r|   	input_dirrW   r  r  s        rZ   r   zMegatronEngine.load_checkpoint  s    zz	&'#&'#!!###:I$+W[Wegkgu:v:v7	7!!###"4X19 	11,,N..00000	1 	1,,r\   Nc
                 4   t                      }|j        dk    rt          d          |j        dk    rt	          d          |j        rt	          d          |j        t	          d          |j        t	          d          ||t	          d	          |d
}nd|cxk     rdk    sn t	          d          |d}nd|cxk    rdk    sn t	          d          |d}n9|dk    r|dk    rt	          d          d|cxk    rd
k    sn t	          d          |
                    dd          }d|cxk    rd
k    sn t	          d          |
                    dd          }d|cxk    rd
k    sn t	          d          |
                    dd          }t          |t                    st	          d          |}|Lt          |t                    st	          d          |dk     rt	          d          |j        d         dk    rdS t                      }|
                    d|j                  }|$t          |t                    st	          d          |	d
}	d}d}d}t          j                                        dk    r|:t          j                            |j        d         g|j        d         z            }n(|                    d                                           }|||j        d         z
  }|dk    rt	          d!          |r||j        d         z   dz   }d"t+          j        |d"z            z  }||j        d         dz   z
  }t          j                            |j        g|z  g|j        d         z            }t          j        t          j        |dddf         d           |                                |gd           }n||j        d         z   }d"t+          j        |d"z            z  }||j        d         z
  }t          j                            |j        g|z  g|j        d         z            }t          j        |                                |gd           }|                    d          |                    d          g}t5          d#|d$          }|                                }t9          |t          j        |d%          }t9          |d         t          j        |d%          }|
                    d&d          }t          j                            |           tA          | j!        tD          tF          tH          f          }|tK          |||||d|	'          \  }}ntM          |||d|||||d()
  
        \  }}}|S )*a  
        Generate method for GPT2 model. This method is used for inference. Supports both greedy and beam search along
        with sampling. Refer the Megatron-LM repo for more details

        Args:
            inputs (torch.Tensor): input ids
            attention_mask (torch.Tensor, optional): attention mask. Defaults to None.
            max_length (int, optional): max length of the generated sequence. Defaults to None.
            Either this or max_new_tokens should be provided.
            max_new_tokens (int, optional): max number of tokens to be generated. Defaults to None.
            Either this or max_length should be provided.
            num_beams (int, optional): number of beams to use for beam search. Defaults to None.
            temperature (float, optional): temperature for sampling. Defaults to 1.0.
            top_k (int, optional): top k tokens to consider for sampling. Defaults to 0.0.
            top_p (float, optional): tokens in top p probability are considered for sampling. Defaults to 0.0.
            length_penalty (float, optional): length penalty for beam search. Defaults to None.
            kwargs: additional key-value arguments
        rK   z1Generate method is not implemented for this modelr   z1Generate method requires data parallelism to be 1z9Generate method requires sequence parallelism to be FalseNz2Checkpoint activations cannot be set for inferencez$Vocab file is required for inferencez;`max_length` or `max_new_tokens` are required for inferencer  r  g      Y@zAtemperature must be a positive number less than or equal to 100.0r   i  z:top_k must be a positive number less than or equal to 1000z/top_p and top_k sampling cannot be set togetherz'top_p must be less than or equal to 1.0top_p_decayz-top_p_decay must be less than or equal to 1.0top_p_boundz-top_p_bound must be less than or equal to 1.0add_BOSFzadd_BOS must be a booleanzbeam_width must be an integerz!beam_width must be greater than 0z,When doing beam_search, batch size must be 1
stop_tokenzstop_token must be an integerrO  )axisz%max_new_tokens must be greater than 0   r	   )int_listrQ   )r   rQ   random_seed)rA  num_return_genlength_penaltyT)return_output_log_probstop_ktop_pr>  r?  temperature#use_eod_token_for_early_termination)'r   rS   NotImplementedErrordata_parallel_sizerV   sequence_parallelrecompute_granularityrs  r  r   r   r`  r  r   rt  r   r   r  rE  
LongTensorrR  r-  ceilr  r  sizer3   tolistr4   r3  randommanual_seedrA   r  torchDDPr  r.   r5   r6   )r|   inputsrA  
max_lengthmax_new_tokens	num_beamsrK  rI  rJ  rG  r   rW   r>  r?  r@  
beam_widthrx  rA  
sizes_listprompts_tokens_tensorprompts_length_tensorr  sizes_tensorsizescontext_tokens_tensorcontext_length_tensorrE  unwrapped_modelr;  r  s                                 rZ   megatron_generatez MegatronEngine.megatron_generate  s/   B zz5((%&YZZZ"Q&&PQQQ! 	ZXYYY%1QRRR?"CDDD ."8Z[[[KK,,,,u,,,,`aaa=EEu$$$$$$$$YZZZ=EES[[US[[NOOO5''''C'''' !JKKKjj44{))))c))))LMMMjj44{))))c))))LMMM**Y..7D)) 	:8999
!j#.. B !@AAAA~~ !DEEE|A""EE!OO	ZZim<<
!j#.. B !@AAA! N
 $ $%%''1,,%(-
(=(=v|A>ORXR^_`Ra>a(b(b%%(6(:(:(:(C(C(H(H(J(J%%!+fl1o!="" !HIII X+fl1o=A
:>!:!::
!+v|A/B!C*//)->1Q0RU[UabcUd0dee(-_WQQQT]<<<fkkmmWU\^) ) )%%
 ,fl1o=
:>!:!::
!+fl1o!=*//)->1Q0RU[UabcUd0dee(-fkkmmW5MTV(W(W(W% &**1--%**1--J *!jqIII ##%% 0Lahi j j j 0q5;Odkl m m m jj22  ---&t8]8[\\!=%%% -  IFAA K%%(-'''48  LFAq r\   )NNNNNNNN)r   r   r   r   r   r  r  r  r  r<   r  r(  r  r   r   re  r  r  s   @rZ   r  r    s        ( ( ( ( (>  4     : : :# # #2H H H.& & &P= = =~' ' '4$ $ $1 1 1  p p p p p p p pr\   r  c                      t          |           S )z
    Average losses across data parallel group.

    Args:
        losses (List[Tensor]): List of losses to average across data parallel group.
    )r>   )r  s    rZ   %avg_losses_across_data_parallel_grouprg  r  s     5V<<<r\   c                 ,    d }t          || d          S )z
    Recursively gather tensor in a nested list/tuple/dictionary of tensors from data parallel ranks.

    Args:
        tensor (nested list/tuple/dictionary of `torch.Tensor`):
            The data to gather across data parallel ranks.

    c                 t     j         dk    r                                 d            fdt          t          j                            t          j                                        D             }t          j                            | t          j                               t          j	        |d          S )Nr   c                 8    g | ]}t          j                  S r   )r   
empty_like)r   r  r   s     rZ   r  zOgather_across_data_parallel_groups.<locals>._gpu_gather_one.<locals>.<listcomp>  s4     
 
 
 V$$
 
 
r\   r   r  )
ndimr  r   r   r   get_world_sizer   get_data_parallel_group
all_gatherr  )r   output_tensorss   ` rZ   _gpu_gather_onez;gather_across_data_parallel_groups.<locals>._gpu_gather_one  s    ;!\\^^D)F
 
 
 
5,;;#B]B_B_;``aa
 
 
 	$$^V3C^C`C`$aaayQ////r\   T)error_on_other_type)r   )r   rq  s     rZ   "gather_across_data_parallel_groupsrs  }  s(    0 0 0 _f$OOOOr\   )TTTT)wrv   r-  r  abcr   	functoolsr   r   torch.nn.functionalnn
functionalrU  torch.nnr   r   r   torch.nn.parallel.distributedr   rW  rn   r
   r^   r   importsr   
operationsr   r   megatronr   r   r   r   r   megatron.argumentsr   r   r   r   r   megatron.checkpointingr   r   r   megatron.corer   r   megatron.core.distributedr  r   megatron.core.enumsr    megatron.core.parallel_stater!   r"   megatron.core.pipeline_parallelr#   megatron.core.utilsr$   megatron.data.dataset_utilsr%   megatron.global_varsr&   megatron.initializer'   r(   r)   r*   r+   r,   megatron.modelr-   r.   r/   r0   megatron.model.classificationr1   megatron.optimizerr2   &megatron.text_generation.communicationr3   r4   #megatron.text_generation.generationr5   r6   megatron.tokenizer.tokenizerr7   megatron.trainingr8   r9   r:   r;   r<   r=   megatron.utilsr>   r?   r@   rA   r[   rq   rs   r   r   r   rf   r
  r  rg   r  r   rn  r  r  r  Moduler  rg  rs  r   r\   rZ   <module>r     sH     				                       A A A A A A A A A A M M M M M M , , , , , , , , , , , , - - - - - - 9 9 9 9 9 9 9 9  7                           cbbbbbbbbb22222222MMMMMM>>>>>>------ppppppppIIIIII444444KKKKKK999999                KJJJJJJJJJJJ<<<<<<999999[[[[[[[[        FEEEEE                          2 2 2 2j' ' '>kL kL kL kL kL kL kL kL\  $>L >L >LD+ + + + +!5 + + + b b b       .    !5            "K K K K K% K K K\A A A A A$ A A AHN N N N N# N N Nb	@ 	@ 	@ 15B - - - -`P P P P PUX_ P P Ph= = =P P P P Pr\   