
    קgJ;                     B   d dl Z d dlmZ d dlmZ d dlmZmZ ej        fdZ	d ej        fdZ
d ej        fdZej        ej        fdZej        ej        fdZej        fd	Zej        fd
Zej        fdZddej        fdZej        ej        fdZ G d de          Z G d de          Z G d de          Z G d de          Z G d de          Z G d de          Z G d de          Z G d de          Z G d de          Z G d  d!e          ZdS )"    N)Function)groupReduceOpc                 :    t                               |||           S )a  
    Broadcasts the tensor to the whole group.

    ``tensor`` must have the same number of elements in all processes
    participating in the collective.

    Arguments:
        tensor (Tensor): Data to be sent if ``src`` is the rank of current
            process.
        src (int): Source rank.
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        Tensor: Received tensor from the broadcast op.

    )
_Broadcastapply)tensorsrcr   s      [/var/www/html/ai-engine/env/lib/python3.11/site-packages/torch/distributed/nn/functional.py	broadcastr      s    " C///    c                 :    t                               |||           S )aT  
    Gathers a list of tensors in a single process.

    Arguments:
        tensor (Tensor): Input tensor.
        dst (int, optional): Destination rank (default is 0).
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        tuple[Tensor]: List of appropriately-sized tensors with the gathered data.
    )_Gatherr   )r	   dstr   s      r   gatherr       s     ==eV,,,r   c                 (    t          j        ||g| R  S )a  
    Scatters a list of tensors to all processes in a group.

    Each process will receive exactly one tensor and store its data in the
    ``tensor`` argument.

    Arguments:
        tensors (list[Tensor]): List of tensors to scatter on the source rank.
            Receivers must pass ``None`.
        src (int, optional): Source rank (default is 0).
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        Tensor: Output tensor from the scatter operation.

    )_Scatterr   )tensorsr
   r   s      r   scatterr   /   s    " >#u/w////r   c                 <    t                               ||||           S )a  
    Reduces the tensor data across all machines.

    Only the process with rank ``dst`` is going to receive the final result.

    Arguments:
        tensor (Tensor): Input of the collective.
        dst (int): Destination rank.
        op (optional): One of the values from
            ``torch.distributed.ReduceOp``
            enum.  Specifies an operation used for element-wise reductions.
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        Tensor: Output of the collective.

    )_Reducer   )r	   r   opr   s       r   reducer   C   s    $ ==b%000r   c                 *    t          j        ||| g|R  S )a  
    Reduces, then scatters a list of tensors to all processes in a group.

    Arguments:
        output (Tensor): Output tensor.
        input_list (list[Tensor]): List of tensors to reduce and scatter.
        op (optional): One of the values from
            ``torch.distributed.ReduceOp``
            enum.  Specifies an operation used for element-wise reductions.
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        Tensor: Output of the collective.

    )_Reduce_Scatterr   )output
input_listr   r   s       r   reduce_scatterr   X   s!       UF@Z@@@@r   c                 8    t                               ||           S )a  
    Gathers tensors from the whole group in a list.

    Arguments:
        tensor (Tensor): Tensor to be broadcast from current process.
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        tuple([Tensor]): Output of the collective.

    )
_AllGatherr   )r	   r   s     r   
all_gatherr!   k   s     E6***r   c                 :    t                               | ||          S )a  
    Single tensor all gather. Gathers a single tensor from all ranks, and puts them in a single output tensor.

    Args:
        output_tensor (Tensor): Output tensor. It should contain
            correctly-sized tensors to be used for output of the collective.
        input_tensor (Tensor): Tensor to be broadcast from current process.
        group (ProcessGroup, optional): The process group to work on. If None,
            the default process group will be used.

    Examples:
        >>> # All tensors below are of torch.int64 dtype.
        >>> # We have 2 process groups, 2 ranks.
        >>> # xdoctest: +SKIP("incorrect want text")
        >>> output_tensor = torch.zeros(2, dtype=torch.int64)
        >>> output_tensor
        [tensor([0, 0])] # Rank 0 and 1
        >>> tensor = torch.arange(1, dtype=torch.int64) + 1 + rank
        >>> tensor
        tensor([1]) # Rank 0
        tensor([2]) # Rank 1
        >>> dist.all_gather_base(output_tensor, tensor)
        >>> output_tensor
        tensor([1,2]) # Rank 0
        tensor([1,2]) # Rank 1

    .. warning::
        `_all_gather_base` is experimental and subject to change.
        It is the caller's responsibility to ensure the output_tensor
        is correctly sized.

    )_AllGatherBaser   )output_tensorinput_tensorr   s      r   _all_gather_baser&   z   s    B |UCCCr   c                 (    t          j        || g|R  S )a  
    Each process scatters list of input tensors to all processes in a group and return gathered list of tensors in output list.

    Arguments:
        output_tensor_list (list[Tensor]): list of tensors to gather one per rank.
        input_tensor_list (list[Tensor]): List of tensors to scatter one per rank.
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        tuple([Tensor]): Output of the collective.

    )	_AlltoAllr   )output_tensor_listinput_tensor_listr   s      r   
all_to_allr+      s      ?5"4I7HIIIIr   c                 >    t                               || |||          S )a  
    Each process splits input tensor and then scatters the split list to all processes in a group.

    Then concatenate the received tensors from all the processes in the group and return single output tensor.

    Arguments:
        output (Tensor): Gathered concatenated output tensor.
        input (Tensor): Input tensor to scatter.
        output_split_sizes: (list[Int], optional): Output split sizes for dim 0
            if specified None or empty, dim 0 of ``output`` tensor must divide
            equally by ``world_size``.
        input_split_sizes: (list[Int], optional): Input split sizes for dim 0
            if specified None or empty, dim 0 of ``input`` tensor must divide
            equally by ``world_size``.

    Returns:
        Tensor: Output of the collective.

    )_AlltoAllSingler   )r   inputoutput_split_sizesinput_split_sizesr   s        r   all_to_all_singler1      s)    4   v)+<e  r   c                 :    t                               |||           S )a&  
    Reduces the tensor data across all machines in such a way that all get the final result.

    After the call the returned tensor is going to be bitwise
    identical in all processes.

    Arguments:
        tensor (Tensor): Input of the collective.
        op (optional): One of the values from
            ``torch.distributed.ReduceOp``
            enum.  Specifies an operation used for element-wise reductions.
        group (ProcessGroup, optional): The process group to work on.

    Returns:
        Tensor: Output of the collective

    )
_AllReducer   )r	   r   r   s      r   
all_reducer4      s    $ Bv...r   c                   :    e Zd Zed             Zed             ZdS )r   c                     || _         || _        t          j        |          | _        |                                }t          j        |||           |S Nr   )r
   r   distget_rankrankcloner   )ctxr
   r   r	   s       r   forwardz_Broadcast.forward   sO    	=u--- vs%0000r   c                     t                               | j        t          j        | j        |          }| j        | j        k    r|                                 d d |fS N)r   r   r
   r   SUMr   r;   zero_)r=   grad_outputgxs      r   backwardz_Broadcast.backward   sH    ]]37HL#)[II7chHHJJJdBr   N__name__
__module____qualname__staticmethodr>   rE    r   r   r   r      sH          \     \     r   r   c                   :    e Zd Zed             Zed             ZdS )r   c                 V   || _         || _        fdt          t          j        |                    D             }                                t          j        |          |k    rt          j        |||           nt          j        d ||           t          |          S )Nc                 8    g | ]}t          j                  S rK   )torch
zeros_like).0ir	   s     r   
<listcomp>z#_Gather.forward.<locals>.<listcomp>   s1     
 
 
)*EV$$
 
 
r   r8   )	r   r   ranger9   get_world_size
contiguousr:   r   tuple)r=   r   r   r	   tensor_lists      ` r   r>   z_Gather.forward   s    	

 
 
 
.3D4Ge4T4T4T.U.U
 
 
 ""$$=u%%%,,KS>>>>>Kc7777[!!!r   c                 D    dt          j        | j        | j        g|R  fz   S NNN)r   r   r   r   )r=   grad_outputss     r   rE   z_Gather.backward
  s(    x~cgsyP<PPPRRRr   NrF   rK   r   r   r   r      sM        " " \"$ S S \S S Sr   r   c                   :    e Zd Zed             Zed             ZdS )r   c                 @   || _         || _        t          fdD                       sJ t          j        d                   }t          j        |          |k    r&t          j        |t                    ||           nt          j        |d ||           |S )Nc              3   x   K   | ]4}|                                 d                                           k    V  5dS )r   Nsize)rQ   tr   s     r   	<genexpr>z#_Scatter.forward.<locals>.<genexpr>  s>      BBQ16688wqz000BBBBBBr   r   r8   )	r
   r   allrO   rP   r9   r:   r   list)r=   r
   r   r   r   s      ` r   r>   z_Scatter.forward  s    	BBBB'BBBBBBBB!'!*--=u%%%,,Lg5AAAAALs%8888r   c                 T    dt                               | j        | j        |          z   S rZ   )r   r   r
   r   r=   rC   s     r   rE   z_Scatter.backward  s!    gmmCGSYLLLLr   NrF   rK   r   r   r   r     sM        	 	 \	 M M \M M Mr   r   c                   :    e Zd Zed             Zed             ZdS )r   c                 z    || _         || _        |                                }t          j        ||||           |S Nr   r   )r
   r   r<   r9   r   )r=   r
   r   r   r	   s        r   r>   z_Reduce.forward"  s;    	FCBe4444r   c                 V    dt                               | j        | j        |          fz   S N)NNN)r   r   r
   r   rg   s     r   rE   z_Reduce.backward*  s&    !Z%5%5cgsy+%V%V$XXXr   NrF   rK   r   r   r   r   !  sM          \ Y Y \Y Y Yr   r   c                   :    e Zd Zed             Zed             ZdS )r   c                     || _         |                                }t          d |D                       }t          j        |t          |          ||           |S )Nc              3   >   K   | ]}|                                 V  d S r@   rV   rQ   rb   s     r   rc   z*_Reduce_Scatter.forward.<locals>.<genexpr>5  s*      !L!LQ!,,..!L!L!L!L!L!Lr   rk   )r   rV   rW   r9   r   re   )r=   r   r   r	   r*   s        r   r>   z_Reduce_Scatter.forward0  s`    	""$$!!L!L:K!L!L!LLLFD):$;$;%PPPPr   c                 H    dt                               | j        |          z   S rm   )r    r   r   rg   s     r   rE   z_Reduce_Scatter.backward9  s    !J$4$4SY$L$LLLr   NrF   rK   r   r   r   r   /  sM          \ M M \M M Mr   r   c                   :    e Zd Zed             Zed             ZdS )r    c                                                      || _        fdt          t          j        |                    D             }t          j        ||           t          |          S )Nc                 8    g | ]}t          j                  S rK   rO   
empty_like)rQ   _r	   s     r   rS   z&_AllGather.forward.<locals>.<listcomp>E  s1     
 
 
)*EV$$
 
 
r   r8   )rV   r   rT   r9   rU   r!   rW   )r=   r   r	   out_tensor_lists     ` r   r>   z_AllGather.forward?  s     ""$$	
 
 
 
.3D4Ge4T4T4T.U.U
 
 
 	u====_%%%r   c                    t          j        | j                  t           j        j        u rXt          j        | j                  }t          j        ||                   }t          j	        t          j        | j        |g|R  }nLd |D             }t          j	        | j        |g|R  }t          j        t          j        |          d          }d |fS )Nr8   c                 6    g | ]}t          j        |          S rK   rw   )rQ   r	   s     r   rS   z'_AllGather.backward.<locals>.<listcomp>U  s#    OOO5+F33OOOr   r   )dim)r9   get_backendr   BackendNCCLr:   rO   rx   r   r   r   rA   r(   sumstack)r=   r\   r;   rD   rX   gxss         r   rE   z_AllGather.backwardL  s    #),,,0AAA=sy111D!,t"455B &x|SYR\RRRBB PO,OOOK/#)[H<HHHC5;s++333Bbzr   NrF   rK   r   r   r    r    >  sH        
& 
& \
&   \  r   r    c                   :    e Zd Zed             Zed             ZdS )r#   c                 f    || _         t          j        ||                                |           |S r7   )r   r9   r&   rV   )r=   r$   r%   r   s       r   r>   z_AllGatherBase.forward\  s3    	m\-D-D-F-FeTTTTr   c                    t          j        | j                  t           j        j        u rt          j        | j                  }t          |                                          }|d         |z  dk    rt          d| d|           |d         t          j        | j                  z  |d<   t          j
        ||j        |j                  }t          j        ||t          j        | j                   nt          d          d |d fS )Nr8   r   zTensor with dimensions: z8 does not have first dimension divisible by world_size: devicedtypezBackend not supported!)r9   r~   r   r   r   rU   re   ra   RuntimeErrorrO   emptyr   r   _reduce_scatter_baser   rA   )r=   rC   
world_sizeout_sizerD   s        r   rE   z_AllGatherBase.backwardb  s   #),,,0AAA,39===JK,,..//H{Z'1,,"Vx V VISV V   #1+)<39)M)M)MMHQK!3;;L  B %b+x|SYOOOO7888b$r   NrF   rK   r   r   r#   r#   [  sH          \
     \     r   r#   c                   :    e Zd Zed             Zed             ZdS )r(   c                 <   || _         fdt          t          j        |                    D             | _        t          j        |          }t          d D                       t          j        |          t          j        j	        u r[t          t          j        |                    D ]7}d }||k    rt                    }t          j        ||         |||           8n$t          j        |t                    |           t          |          S )Nc                 D    g | ]}|                                          S rK   r`   )rQ   rR   r   s     r   rS   z%_AlltoAll.forward.<locals>.<listcomp>z  s4     &
 &
 &
"#GAJOO&
 &
 &
r   r8   c              3   >   K   | ]}|                                 V  d S r@   rq   rr   s     r   rc   z$_AlltoAll.forward.<locals>.<genexpr>~  s*      881888888r   )r   rT   r9   rU   input_tensor_size_listr:   rW   r~   r   GLOOre   r   r+   )r=   r   rz   r   my_rankrR   to_sends      `   r   r>   z_AlltoAll.forwardw  s:   	&
 &
 &
 &
',T-@u-M-M-M'N'N&
 &
 &
" -e,,,8888888%(((DL,===4.U;;;<< J J<<"7mmG_Q/!5IIIII	J OW   
 _%%%r   c                 `    fd| j         D             }dt          j        | j        |gR  z   S )Nc                 j    g | ]/}t          j        |d          j        d          j                  0S )r   r   )rO   r   r   r   )rQ   ra   r\   s     r   rS   z&_AlltoAll.backward.<locals>.<listcomp>  sQ     
 
 
  K\!_3<?;P  
 
 
r   r[   )r   r(   r   r   )r=   r\   rX   s    ` r   rE   z_AlltoAll.backward  sS    
 
 
 
 2	
 
 
 iociT|TTTTTr   NrF   rK   r   r   r(   r(   v  sM        & & \&, U U \U U Ur   r(   c                   :    e Zd Zed             Zed             ZdS )r-   c                     || _         |                                | _        || _        || _        t          j        |||||           |S )N)r/   r0   r   )r   ra   
input_sizer/   r0   r9   r1   )r=   r   r   r/   r0   r.   s         r   r>   z_AlltoAllSingle.forward  sX    	!2 21/	
 	
 	
 	
 r   c           	          t          j        | j        |j        |j                  }dt
                              | j        || j        | j	        |
                                          fz   S )Nr   )NNNN)rO   r   r   r   r   r-   r   r   r/   r0   rV   )r=   rC   r	   s      r   rE   z_AlltoAllSingle.backward  sq    N;#5[=N
 
 
 (!!	&%&&(( +
 
 	
r   NrF   rK   r   r   r-   r-     sH          \ 
 
 \
 
 
r   r-   c                   :    e Zd Zed             Zed             ZdS )r3   c                 x    || _         || _        |                                }t          j        |||           |S rj   )r   r   r<   r9   r4   )r=   r   r   r	   s       r   r>   z_AllReduce.forward  s9    	2U3333r   c                 V    dt                               | j        | j        |          fz   S rZ   )r3   r   r   r   rg   s     r   rE   z_AllReduce.backward  s&    z//	;OOQQQr   NrF   rK   r   r   r3   r3     sM          \ R R \R R Rr   r3   )rO   torch.distributeddistributedr9   torch.autogradr   r   r   WORLDr   r   r   rA   r   r   r!   r&   r+   r1   r4   r   r   r   r   r   r    r#   r(   r-   r3   rK   r   r   <module>r      s`                # # # # # #
 . - - - - - - - "' 0 0 0 0(  - - - - %+ 0 0 0 0( $<u{ 1 1 1 1* +3,ek A A A A& #[ + + + + 9> !D !D !D !DH =BK J J J J& 
+   > #,ek / / / /*               (S S S S Sh S S S2M M M M Mx M M M$Y Y Y Y Yh Y Y YM M M M Mh M M M       :         X      6 U  U  U  U  U  U  U  UF
 
 
 
 
h 
 
 
@R R R R R R R R R Rr   