from typing import Any, Optional

import torch
from torch import Tensor
from torch.nn import functional as F, init
from torch.nn.parameter import Parameter, UninitializedBuffer, UninitializedParameter

from ._functions import SyncBatchNorm as sync_batch_norm
from .lazy import LazyModuleMixin
from .module import Module

__all__ = [
    "BatchNorm1d",
    "LazyBatchNorm1d",
    "BatchNorm2d",
    "LazyBatchNorm2d",
    "BatchNorm3d",
    "LazyBatchNorm3d",
    "SyncBatchNorm",
]


class _NormBase(Module):
    """Common base of _InstanceNorm and _BatchNorm."""

    _version = 2
    __constants__ = ["track_running_stats", "momentum", "eps", "num_features", "affine"]
    num_features: int
    eps: float
    momentum: Optional[float]
    affine: bool
    track_running_stats: bool

    def __init__(
        self,
        num_features: int,
        eps: float = 1e-5,
        momentum: Optional[float] = 0.1,
        affine: bool = True,
        track_running_stats: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.num_features = num_features
        self.eps = eps
        self.momentum = momentum
        self.affine = affine
        self.track_running_stats = track_running_stats
        if self.affine:
            self.weight = Parameter(torch.empty(num_features, **factory_kwargs))
            self.bias = Parameter(torch.empty(num_features, **factory_kwargs))
        else:
            self.register_parameter("weight", None)
            self.register_parameter("bias", None)
        if self.track_running_stats:
            self.register_buffer("running_mean", torch.zeros(num_features, **factory_kwargs))
            self.register_buffer("running_var", torch.ones(num_features, **factory_kwargs))
            self.register_buffer(
                "num_batches_tracked",
                torch.tensor(
                    0,
                    dtype=torch.long,
                    **{k: v for k, v in factory_kwargs.items() if k != "dtype"},
                ),
            )
        else:
            self.register_buffer("running_mean", None)
            self.register_buffer("running_var", None)
            self.register_buffer("num_batches_tracked", None)
        self.reset_parameters()

    def reset_running_stats(self) -> None:
        if self.track_running_stats:
            # running_mean/running_var/num_batches_tracked are only registered
            # when track_running_stats is on
            self.running_mean.zero_()
            self.running_var.fill_(1)
            self.num_batches_tracked.zero_()

    def reset_parameters(self) -> None:
        self.reset_running_stats()
        if self.affine:
            init.ones_(self.weight)
            init.zeros_(self.bias)

    def _check_input_dim(self, input):
        raise NotImplementedError

    def extra_repr(self):
        return (
            "{num_features}, eps={eps}, momentum={momentum}, affine={affine}, "
            "track_running_stats={track_running_stats}".format(**self.__dict__)
        )

    def _load_from_state_dict(
        self,
        state_dict,
        prefix,
        local_metadata,
        strict,
        missing_keys,
        unexpected_keys,
        error_msgs,
    ):
        version = local_metadata.get("version", None)

        if (version is None or version < 2) and self.track_running_stats:
            # at version 2: added num_batches_tracked buffer, which defaults to 0
            num_batches_tracked_key = prefix + "num_batches_tracked"
            if num_batches_tracked_key not in state_dict:
                state_dict[num_batches_tracked_key] = (
                    self.num_batches_tracked
                    if self.num_batches_tracked is not None
                    and self.num_batches_tracked.device != torch.device("meta")
                    else torch.tensor(0, dtype=torch.long)
                )

        super()._load_from_state_dict(
            state_dict,
            prefix,
            local_metadata,
            strict,
            missing_keys,
            unexpected_keys,
            error_msgs,
        )


class _BatchNorm(_NormBase):
    def __init__(
        self,
        num_features: int,
        eps: float = 1e-5,
        momentum: Optional[float] = 0.1,
        affine: bool = True,
        track_running_stats: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__(
            num_features, eps, momentum, affine, track_running_stats, **factory_kwargs
        )

    def forward(self, input: Tensor) -> Tensor:
        self._check_input_dim(input)

        # exponential_average_factor is set to self.momentum (when available)
        # only so that it gets updated in the ONNX graph when exported.
        if self.momentum is None:
            exponential_average_factor = 0.0
        else:
            exponential_average_factor = self.momentum

        if self.training and self.track_running_stats:
            if self.num_batches_tracked is not None:
                self.num_batches_tracked.add_(1)
                if self.momentum is None:  # use cumulative moving average
                    exponential_average_factor = 1.0 / float(self.num_batches_tracked)
                else:  # use exponential moving average
                    exponential_average_factor = self.momentum

        # Mini-batch stats are used for normalization in training mode, and in
        # eval mode when the buffers are None.
        if self.training:
            bn_training = True
        else:
            bn_training = (self.running_mean is None) and (self.running_var is None)

        # Buffers are passed only when they should be updated (training mode
        # with tracking enabled) or when they are used for normalization
        # (eval mode with non-None buffers).
        return F.batch_norm(
            input,
            # If buffers are not to be tracked, ensure that they won't be updated
            self.running_mean if not self.training or self.track_running_stats else None,
            self.running_var if not self.training or self.track_running_stats else None,
            self.weight,
            self.bias,
            bn_training,
            exponential_average_factor,
            self.eps,
        )


class _LazyNormBase(LazyModuleMixin, _NormBase):
    weight: UninitializedParameter
    bias: UninitializedParameter

    def __init__(
        self,
        eps=1e-5,
        momentum=0.1,
        affine=True,
        track_running_stats=True,
        device=None,
        dtype=None,
    ) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        # affine and track_running_stats are hardcoded to False to avoid
        # creating tensors that will soon be overwritten.
        super().__init__(0, eps, momentum, False, False, **factory_kwargs)
        self.affine = affine
        self.track_running_stats = track_running_stats
        if self.affine:
            self.weight = UninitializedParameter(**factory_kwargs)
            self.bias = UninitializedParameter(**factory_kwargs)
        if self.track_running_stats:
            self.running_mean = UninitializedBuffer(**factory_kwargs)
            self.running_var = UninitializedBuffer(**factory_kwargs)
            self.num_batches_tracked = torch.tensor(
                0,
                dtype=torch.long,
                **{k: v for k, v in factory_kwargs.items() if k != "dtype"},
            )

    def reset_parameters(self) -> None:
        if not self.has_uninitialized_params() and self.num_features != 0:
            super().reset_parameters()

    def initialize_parameters(self, input) -> None:
        if self.has_uninitialized_params():
            self.num_features = input.shape[1]
            if self.affine:
                assert isinstance(self.weight, UninitializedParameter)
                assert isinstance(self.bias, UninitializedParameter)
                self.weight.materialize((self.num_features,))
                self.bias.materialize((self.num_features,))
            if self.track_running_stats:
                self.running_mean.materialize((self.num_features,))
                self.running_var.materialize((self.num_features,))
            self.reset_parameters()


class BatchNorm1d(_BatchNorm):
    r"""Applies Batch Normalization over a 2D or 3D input.

    Method described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
    of size `C` (where `C` is the number of features or channels of the input). By default, the
    elements of :math:`\gamma` are set to 1 and the elements of :math:`\beta` are set to 0.
    At train time in the forward pass, the standard-deviation is calculated via the biased estimator,
    equivalent to ``torch.var(input, unbiased=False)``. However, the value stored in the
    moving average of the standard-deviation is calculated via the unbiased  estimator, equivalent to
    ``torch.var(input, unbiased=True)``.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done over the `C` dimension, computing statistics
    on `(N, L)` slices, it's common terminology to call this Temporal Batch Normalization.

    Args:
        num_features: number of features or channels :math:`C` of the input
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``

    Shape:
        - Input: :math:`(N, C)` or :math:`(N, C, L)`, where :math:`N` is the batch size,
          :math:`C` is the number of features or channels, and :math:`L` is the sequence length
        - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.BatchNorm1d(100)
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm1d(100, affine=False)
        >>> input = torch.randn(20, 100)
        >>> output = m(input)
    """

    def _check_input_dim(self, input):
        if input.dim() != 2 and input.dim() != 3:
            raise ValueError(f"expected 2D or 3D input (got {input.dim()}D input)")


class LazyBatchNorm1d(_LazyNormBase, _BatchNorm):
    r"""A :class:`torch.nn.BatchNorm1d` module with lazy initialization.

    Lazy initialization based on the ``num_features`` argument of the :class:`BatchNorm1d` that is inferred
    from the ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``
    """

    cls_to_become = BatchNorm1d  # type: ignore[assignment]

    def _check_input_dim(self, input):
        if input.dim() != 2 and input.dim() != 3:
            raise ValueError(f"expected 2D or 3D input (got {input.dim()}D input)")


class BatchNorm2d(_BatchNorm):
    r"""Applies Batch Normalization over a 4D input.

    4D is a mini-batch of 2D inputs
    with additional channel dimension. Method described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
    of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
    to 1 and the elements of :math:`\beta` are set to 0. At train time in the forward pass, the
    standard-deviation is calculated via the biased estimator, equivalent to
    ``torch.var(input, unbiased=False)``. However, the value stored in the moving average of the
    standard-deviation is calculated via the unbiased  estimator, equivalent to
    ``torch.var(input, unbiased=True)``.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done over the `C` dimension, computing statistics
    on `(N, H, W)` slices, it's common terminology to call this Spatial Batch Normalization.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, H, W)`
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``

    Shape:
        - Input: :math:`(N, C, H, W)`
        - Output: :math:`(N, C, H, W)` (same shape as input)

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.BatchNorm2d(100)
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm2d(100, affine=False)
        >>> input = torch.randn(20, 100, 35, 45)
        >>> output = m(input)
    """

    def _check_input_dim(self, input):
        if input.dim() != 4:
            raise ValueError(f"expected 4D input (got {input.dim()}D input)")


class LazyBatchNorm2d(_LazyNormBase, _BatchNorm):
    r"""A :class:`torch.nn.BatchNorm2d` module with lazy initialization.

    Lazy initialization is done for the ``num_features`` argument of the :class:`BatchNorm2d` that is inferred
    from the ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``
    """

    cls_to_become = BatchNorm2d  # type: ignore[assignment]

    def _check_input_dim(self, input):
        if input.dim() != 4:
            raise ValueError(f"expected 4D input (got {input.dim()}D input)")


class BatchNorm3d(_BatchNorm):
    r"""Applies Batch Normalization over a 5D input.

    5D is a mini-batch of 3D inputs with additional channel dimension as described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over
    the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
    of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
    to 1 and the elements of :math:`\beta` are set to 0. At train time in the forward pass, the
    standard-deviation is calculated via the biased estimator, equivalent to
    ``torch.var(input, unbiased=False)``. However, the value stored in the moving average of the
    standard-deviation is calculated via the unbiased  estimator, equivalent to
    ``torch.var(input, unbiased=True)``.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done over the `C` dimension, computing statistics
    on `(N, D, H, W)` slices, it's common terminology to call this Volumetric Batch Normalization
    or Spatio-temporal Batch Normalization.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, D, H, W)`
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``

    Shape:
        - Input: :math:`(N, C, D, H, W)`
        - Output: :math:`(N, C, D, H, W)` (same shape as input)

    Examples::

        >>> # With Learnable Parameters
        >>> m = nn.BatchNorm3d(100)
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm3d(100, affine=False)
        >>> input = torch.randn(20, 100, 35, 45, 10)
        >>> output = m(input)
    """

    def _check_input_dim(self, input):
        if input.dim() != 5:
            raise ValueError(f"expected 5D input (got {input.dim()}D input)")


class LazyBatchNorm3d(_LazyNormBase, _BatchNorm):
    r"""A :class:`torch.nn.BatchNorm3d` module with lazy initialization.

    Lazy initialization is done for the ``num_features`` argument of the :class:`BatchNorm3d` that is inferred
    from the ``input.size(1)``.
    The attributes that will be lazily initialized are `weight`, `bias`,
    `running_mean` and `running_var`.

    Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
    on lazy modules and their limitations.

    Args:
        eps: a value added to the denominator for numerical stability.
            Default: 1e-5
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``
    """

    cls_to_become = BatchNorm3d  # type: ignore[assignment]

    def _check_input_dim(self, input):
        if input.dim() != 5:
            raise ValueError(f"expected 5D input (got {input.dim()}D input)")


class SyncBatchNorm(_BatchNorm):
    r"""Applies Batch Normalization over a N-Dimensional input.

    The N-D input is a mini-batch of [N-2]D inputs with additional channel dimension, as described in the paper
    `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

    .. math::

        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    The mean and standard-deviation are calculated per-dimension over all
    mini-batches of the same process groups. :math:`\gamma` and :math:`\beta`
    are learnable parameter vectors of size `C` (where `C` is the input size).
    By default, the elements of :math:`\gamma` are sampled from
    :math:`\mathcal{U}(0, 1)` and the elements of :math:`\beta` are set to 0.
    The standard-deviation is calculated via the biased estimator, equivalent to
    `torch.var(input, unbiased=False)`.

    Also by default, during training this layer keeps running estimates of its
    computed mean and variance, which are then used for normalization during
    evaluation. The running estimates are kept with a default :attr:`momentum`
    of 0.1.

    If :attr:`track_running_stats` is set to ``False``, this layer then does not
    keep running estimates, and batch statistics are instead used during
    evaluation time as well.

    .. note::
        This :attr:`momentum` argument is different from one used in optimizer
        classes and the conventional notion of momentum. Mathematically, the
        update rule for running statistics here is
        :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
        where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
        new observed value.

    Because the Batch Normalization is done for each channel in the ``C`` dimension, computing
    statistics on ``(N, +)`` slices, it's common terminology to call this Volumetric Batch
    Normalization or Spatio-temporal Batch Normalization.

    Currently :class:`SyncBatchNorm` only supports
    :class:`~torch.nn.DistributedDataParallel` (DDP) with single GPU per process. Use
    :meth:`torch.nn.SyncBatchNorm.convert_sync_batchnorm()` to convert
    :attr:`BatchNorm*D` layer to :class:`SyncBatchNorm` before wrapping
    Network with DDP.

    Args:
        num_features: :math:`C` from an expected input of size
            :math:`(N, C, +)`
        eps: a value added to the denominator for numerical stability.
            Default: ``1e-5``
        momentum: the value used for the running_mean and running_var
            computation. Can be set to ``None`` for cumulative moving average
            (i.e. simple average). Default: 0.1
        affine: a boolean value that when set to ``True``, this module has
            learnable affine parameters. Default: ``True``
        track_running_stats: a boolean value that when set to ``True``, this
            module tracks the running mean and variance, and when set to ``False``,
            this module does not track such statistics, and initializes statistics
            buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
            When these buffers are ``None``, this module always uses batch statistics
            in both training and eval modes. Default: ``True``
        process_group: synchronization of stats happen within each process group
            individually. Default behavior is synchronization across the whole
            world

    Shape:
        - Input: :math:`(N, C, +)`
        - Output: :math:`(N, C, +)` (same shape as input)

    .. note::
        Synchronization of batchnorm statistics occurs only while training, i.e.
        synchronization is disabled when ``model.eval()`` is set or if
        ``self.training`` is otherwise ``False``.

    Examples::

        >>> # xdoctest: +SKIP
        >>> # With Learnable Parameters
        >>> m = nn.SyncBatchNorm(100)
        >>> # creating process group (optional)
        >>> # ranks is a list of int identifying rank ids.
        >>> ranks = list(range(8))
        >>> r1, r2 = ranks[:4], ranks[4:]
        >>> # Note: every rank calls into new_group for every
        >>> # process group created, even if that rank is not
        >>> # part of the group.
        >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
        >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
        >>> # Without Learnable Parameters
        >>> m = nn.BatchNorm3d(100, affine=False, process_group=process_group)
        >>> input = torch.randn(20, 100, 35, 45, 10)
        >>> output = m(input)

        >>> # network is nn.BatchNorm layer
        >>> sync_bn_network = nn.SyncBatchNorm.convert_sync_batchnorm(network, process_group)
        >>> # only single gpu per process is currently supported
        >>> ddp_sync_bn_network = torch.nn.parallel.DistributedDataParallel(
        >>>                         sync_bn_network,
        >>>                         device_ids=[args.local_rank],
        >>>                         output_device=args.local_rank)
    """

    def __init__(
        self,
        num_features: int,
        eps: float = 1e-5,
        momentum: Optional[float] = 0.1,
        affine: bool = True,
        track_running_stats: bool = True,
        process_group: Optional[Any] = None,
        device=None,
        dtype=None,
    ) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__(
            num_features, eps, momentum, affine, track_running_stats, **factory_kwargs
        )
        self.process_group = process_group

    def _check_input_dim(self, input):
        if input.dim() < 2:
            raise ValueError(f"expected at least 2D input (got {input.dim()}D input)")

    def _check_non_zero_input_channels(self, input):
        if input.size(1) == 0:
            raise ValueError(
                "SyncBatchNorm number of input channels should be non-zero"
            )

    def forward(self, input: Tensor) -> Tensor:
        self._check_input_dim(input)
        self._check_non_zero_input_channels(input)

        # exponential_average_factor is set to self.momentum (when available)
        # only so that it gets updated in the ONNX graph when exported.
        if self.momentum is None:
            exponential_average_factor = 0.0
        else:
            exponential_average_factor = self.momentum

        if self.training and self.track_running_stats:
            assert self.num_batches_tracked is not None
            self.num_batches_tracked.add_(1)
            if self.momentum is None:  # use cumulative moving average
                exponential_average_factor = 1.0 / self.num_batches_tracked.item()
            else:  # use exponential moving average
                exponential_average_factor = self.momentum

        # Mini-batch stats are used in training mode, and in eval mode when
        # buffers are None.
        if self.training:
            bn_training = True
        else:
            bn_training = (self.running_mean is None) and (self.running_var is None)

        # If buffers are not to be tracked, ensure that they won't be updated.
        running_mean = (
            self.running_mean if not self.training or self.track_running_stats else None
        )
        running_var = (
            self.running_var if not self.training or self.track_running_stats else None
        )

        # Don't sync batchnorm stats in inference mode (model.eval()).
        need_sync = (
            bn_training
            and self.training
            and torch.distributed.is_available()
            and torch.distributed.is_initialized()
        )
        if need_sync:
            # currently only GPU/PrivateUse1 input is supported
            if input.device.type not in [
                "cuda",
                torch._C._get_privateuse1_backend_name(),
            ]:
                raise ValueError(
                    "SyncBatchNorm expected input tensor to be on GPU or "
                    f"{torch._C._get_privateuse1_backend_name()}"
                )

            process_group = torch.distributed.group.WORLD
            if self.process_group:
                process_group = self.process_group
            world_size = torch.distributed.get_world_size(process_group)
            need_sync = world_size > 1

        # fallback to framework BN when synchronization is not necessary
        if not need_sync:
            return F.batch_norm(
                input,
                running_mean,
                running_var,
                self.weight,
                self.bias,
                bn_training,
                exponential_average_factor,
                self.eps,
            )
        else:
            assert bn_training
            return sync_batch_norm.apply(
                input,
                self.weight,
                self.bias,
                running_mean,
                running_var,
                self.eps,
                exponential_average_factor,
                process_group,
                world_size,
            )

    @classmethod
    def convert_sync_batchnorm(cls, module, process_group=None):
        r"""Converts all :attr:`BatchNorm*D` layers in the model to :class:`torch.nn.SyncBatchNorm` layers.

        Args:
            module (nn.Module): module containing one or more :attr:`BatchNorm*D` layers
            process_group (optional): process group to scope synchronization,
                default is the whole world

        Returns:
            The original :attr:`module` with the converted :class:`torch.nn.SyncBatchNorm`
            layers. If the original :attr:`module` is a :attr:`BatchNorm*D` layer,
            a new :class:`torch.nn.SyncBatchNorm` layer object will be returned
            instead.

        Example::

            >>> # Network with nn.BatchNorm layer
            >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
            >>> module = torch.nn.Sequential(
            >>>            torch.nn.Linear(20, 100),
            >>>            torch.nn.BatchNorm1d(100),
            >>>          ).cuda()
            >>> # creating process group (optional)
            >>> # ranks is a list of int identifying rank ids.
            >>> ranks = list(range(8))
            >>> r1, r2 = ranks[:4], ranks[4:]
            >>> # Note: every rank calls into new_group for every
            >>> # process group created, even if that rank is not
            >>> # part of the group.
            >>> # xdoctest: +SKIP("distributed")
            >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
            >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
            >>> sync_bn_module = torch.nn.SyncBatchNorm.convert_sync_batchnorm(module, process_group)

        """
        module_output = module
        if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
            module_output = torch.nn.SyncBatchNorm(
                module.num_features,
                module.eps,
                module.momentum,
                module.affine,
                module.track_running_stats,
                process_group,
            )
            if module.affine:
                with torch.no_grad():
                    module_output.weight = module.weight
                    module_output.bias = module.bias
            module_output.running_mean = module.running_mean
            module_output.running_var = module.running_var
            module_output.num_batches_tracked = module.num_batches_tracked
            module_output.training = module.training
            if hasattr(module, "qconfig"):
                module_output.qconfig = module.qconfig
        for name, child in module.named_children():
            module_output.add_module(
                name, cls.convert_sync_batchnorm(child, process_group)
            )
        del module
        return module_output