
    Χg*                         d dl Z d dlmZ d dlZd dlmZmZmZmZm	Z	m
Z
 d dlmZ ddddZddZdd	Zdd
ZddddZddddZdS )    N)List)_flatten_dense_tensors_get_device_index_handle_complex_reorder_tensors_as_take_tensors_unflatten_dense_tensors)nccl)outc                    t          |           } |du |du z  st          d| d|           |,d |D             }t          j                            | |          S t          j                            | |          S )a  Broadcasts a tensor to specified GPU devices.

    Args:
        tensor (Tensor): tensor to broadcast. Can be on CPU or GPU.
        devices (Iterable[torch.device, str or int], optional): an iterable of
          GPU devices, among which to broadcast.
        out (Sequence[Tensor], optional, keyword-only): the GPU tensors to
          store output results.

    .. note::
        Exactly one of :attr:`devices` and :attr:`out` must be specified.

    Returns:
        - If :attr:`devices` is specified,
            a tuple containing copies of :attr:`tensor`, placed on
            :attr:`devices`.
        - If :attr:`out` is specified,
            a tuple containing :attr:`out` tensors, each containing a copy of
            :attr:`tensor`.
    NzFExactly one of 'devices' and 'out' must be specified, but got devices=z	 and out=c                 ,    g | ]}t          |          S  r   .0ds     R/var/www/html/ai-engine/env/lib/python3.11/site-packages/torch/nn/parallel/comm.py
<listcomp>zbroadcast.<locals>.<listcomp>,   !    999A$Q''999    )r   RuntimeErrortorch_C
_broadcast_broadcast_out)tensordevicesr   s      r   	broadcastr      s    * V$$F_- 
lU\llgjll
 
 	
 99999x""67333x&&vs333r      c                 t    d |D             }d | D             } t           j                            | ||          S )a.  Broadcast a sequence of tensors to the specified GPUs.

    Small tensors are first coalesced into a buffer to reduce the number of synchronizations.

    Args:
        tensors (sequence): tensors to broadcast. Must be on the same device,
          either CPU or GPU.
        devices (Iterable[torch.device, str or int]): an iterable of GPU
          devices, among which to broadcast.
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple containing copies of :attr:`tensor`, placed on :attr:`devices`.
    c                 ,    g | ]}t          |          S r   r   r   s     r   r   z'broadcast_coalesced.<locals>.<listcomp>A   s!    555 ##555r   c                 ,    g | ]}t          |          S r   r   r   ts     r   r   z'broadcast_coalesced.<locals>.<listcomp>B        333aq!!333r   )r   r   _broadcast_coalesced)tensorsr   buffer_sizes      r   broadcast_coalescedr*   2   sD     65W555G337333G8((';GGGr   c           	         t          |d          }| d                                         }dt          |           D ]\  }}|j        j        dk    s
J d            |                                |k    r||                                |k    rhd                    d |                                D                       }d                    d	 |D                       }t          d
| d| d|           t          d          t          |           dk    r| d         S t          j        |           r2t          j        |                    }t          j        | |           nt          j        |          j        j        |          }fdt          |           D             }	|          |	d                             |d          z   }|	dd         D ],}
|                    |
                    |d                     -|S )a  Sum tensors from multiple GPUs.

    All inputs should have matching shapes, dtype, and layout. The output tensor
    will be of the same shape, dtype, and layout.

    Args:
        inputs (Iterable[Tensor]): an iterable of tensors to add.
        destination (int, optional): a device on which the output will be
            placed (default: current device).

    Returns:
        A tensor containing an elementwise sum of all inputs, placed on the
        :attr:`destination` device.
    T)optionalr   Ncpuz+reduce_add expects all inputs to be on GPUsxc              3   4   K   | ]}t          |          V  d S Nstrr   r.   s     r   	<genexpr>zreduce_add.<locals>.<genexpr>]   s(      66a3q66666666r   c              3   4   K   | ]}t          |          V  d S r0   r1   r3   s     r   r4   zreduce_add.<locals>.<genexpr>^   s(      ;;1A;;;;;;r   zinput z has invalid size: got z, but expected zLreduce_add expects destination to be on the same GPU with one of the tensors   )outputrootc                 &    g | ]\  }}|k    |S r   r   )r   ir%   
root_indexs      r   r   zreduce_add.<locals>.<listcomp>o   s"    FFFAa:oo1ooor   )devicenon_blocking)r   size	enumerater<   type
get_devicejoin
ValueErrorr   lenr
   is_availabler   
empty_likereducetoadd_)inputsdestination
input_sizer:   inpgotexpectedresultdestination_devicenonroototherr;   s              @r   
reduce_addrT   F   s=    $K$???K!!JJF## 	 	3z%''')V'''>>{**J88::##((66388::66666Cxx;;
;;;;;HQQQ3QQxQQ   $ Z
 
 	
 6{{aay   P!&"455F6
;;;;;"\&*<*C*H+VVFFFF6!2!2FFF
#gajmm%D '4 '
 '
 
 QRR[ 	P 	PEKK(:NNOOOOMr   c                    d | D             }g }g }t          |  D ]}t          d |D                       rAt          ||          }|                    |           |                    |d                    \t          ||          D ]5\  }}	|                    |	j        r|	                                n|	           6|                    |d         d                    Ĉfd|D             }
t          |
 D ]Q}d |D             }t          ||          }t          ||d                   D ]}	|                    |	j                   Rt          t          ||                    S )a\  Sum tensors from multiple GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Args:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device.
    c                     g | ]}g S r   r   )r   _s     r   r   z(reduce_add_coalesced.<locals>.<listcomp>   s     4 4 4 4 4 4r   c              3   $   K   | ]}|j         V  d S r0   )	is_sparser$   s     r   r4   z'reduce_add_coalesced.<locals>.<genexpr>   s$      33qq{333333r   r   c                 0    g | ]}t          |          S r   )r   )r   r(   r)   s     r   r   z(reduce_add_coalesced.<locals>.<listcomp>   s#    MMMGM';//MMMr   c                 ,    g | ]}t          |          S r   )r   )r   chunks     r   r   z(reduce_add_coalesced.<locals>.<listcomp>   s.     
 
 
.3"5))
 
 
r   )
zipallrT   appendrY   to_denser	   datatupler   )rJ   rK   r)   dense_tensorsr7   	ref_ordertensor_at_gpusrP   collr%   itrschunksflat_tensorsflat_results     `           r   reduce_add_coalescedrl   y   s   & !5 4V 4 4 4MFIv, 3 333N33333 	3<<FMM&!!!^A.////}n== @ @aAK>AJJLLLQ????]1-b12222MMMM}MMMDt* 	" 	"
 
7=
 
 
 !{;;)+vayAA 	" 	"A MM!&!!!!		"
 $VY77888r   c          	      J   t          |           } |<d |D             }t          t          j                            | ||||                    S |t          d|           |t          d|           t          t          j                            | |||                    S )a<  Scatters tensor across multiple GPUs.

    Args:
        tensor (Tensor): tensor to scatter. Can be on CPU or GPU.
        devices (Iterable[torch.device, str or int], optional): an iterable of
          GPU devices, among which to scatter.
        chunk_sizes (Iterable[int], optional): sizes of chunks to be placed on
          each device. It should match :attr:`devices` in length and sums to
          ``tensor.size(dim)``. If not specified, :attr:`tensor` will be divided
          into equal chunks.
        dim (int, optional): A dimension along which to chunk :attr:`tensor`.
          Default: ``0``.
        streams (Iterable[torch.cuda.Stream], optional): an iterable of Streams, among
          which to execute the scatter. If not specified, the default stream will
          be utilized.
        out (Sequence[Tensor], optional, keyword-only): the GPU tensors to
          store output results. Sizes of these tensors must match that of
          :attr:`tensor`, except for :attr:`dim`, where the total size must
          sum to ``tensor.size(dim)``.

    .. note::
        Exactly one of :attr:`devices` and :attr:`out` must be specified. When
        :attr:`out` is specified, :attr:`chunk_sizes` must not be specified and
        will be inferred from sizes of :attr:`out`.

    Returns:
        - If :attr:`devices` is specified,
            a tuple containing chunks of :attr:`tensor`, placed on
            :attr:`devices`.
        - If :attr:`out` is specified,
            a tuple containing :attr:`out` tensors, each containing a chunk of
            :attr:`tensor`.
    Nc                 ,    g | ]}t          |          S r   r   r   s     r   r   zscatter.<locals>.<listcomp>   r   r   zI'devices' must not be specified when 'out' is specified, but got devices=zQ'chunk_sizes' must not be specified when 'out' is specified, but got chunk_sizes=)r   rc   r   r   _scatterr   _scatter_out)r   r   chunk_sizesdimstreamsr   s         r   scatterrt      s    D V$$F
{99999UX&&vwS'RRSSSe\cee   "qdoqq   UX**63WEEFFFr   c                2   d | D             } |U|dk    rt          j        dt          d           t          |dd          }t          j                            | ||          S |t          d	|           t          j                            | ||          S )
a  Gathers tensors from multiple GPU devices.

    Args:
        tensors (Iterable[Tensor]): an iterable of tensors to gather.
          Tensor sizes in all dimensions other than :attr:`dim` have to match.
        dim (int, optional): a dimension along which the tensors will be
          concatenated. Default: ``0``.
        destination (torch.device, str, or int, optional): the output device.
          Can be CPU or CUDA. Default: the current CUDA device.
        out (Tensor, optional, keyword-only): the tensor to store gather result.
          Its sizes must match those of :attr:`tensors`, except for :attr:`dim`,
          where the size must equal ``sum(tensor.size(dim) for tensor in tensors)``.
          Can be on CPU or CUDA.

    .. note::
        :attr:`destination` must not be specified when :attr:`out` is specified.

    Returns:
        - If :attr:`destination` is specified,
            a tensor located on :attr:`destination` device, that is a result of
            concatenating :attr:`tensors` along :attr:`dim`.
        - If :attr:`out` is specified,
            the :attr:`out` tensor, now containing results of concatenating
            :attr:`tensors` along :attr:`dim`.
    c                 ,    g | ]}t          |          S r   r#   r$   s     r   r   zgather.<locals>.<listcomp>   r&   r   NrZ   zjUsing -1 to represent CPU tensor is deprecated. Please use a device object or string instead, e.g., "cpu".   )
stacklevelT)	allow_cpur,   zQ'destination' must not be specified when 'out' is specified, but got destination=)	warningswarnFutureWarningr   r   r   _gatherr   _gather_out)r(   rr   rK   r   s       r   gatherr      s    4 437333G
{"M@	    (tdSSSxk:::"qdoqq   x##GS#666r   r0   )r   )Nr   )NNr   N)r   N)rz   typingr   r   torch._utilsr   r   r   r   r   r	   
torch.cudar
   r   r*   rT   rl   rt   r   r   r   r   <module>r      sB                               44 4 4 4 4 4BH H H H(0 0 0 0f,9 ,9 ,9 ,9^/GPT /G /G /G /G /Gd*7D *7 *7 *7 *7 *7 *7 *7r   