"""SAM model configuration"""

from ...configuration_utils import PretrainedConfig
from ...utils import logging


logger = logging.get_logger(__name__)


class SamPromptEncoderConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`SamPromptEncoder`]. The [`SamPromptEncoder`]
    module is used to encode the input 2D points and bounding boxes. Instantiating a configuration with the defaults
    will yield a similar configuration to that of the SAM-vit-h
    [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the hidden states.
        image_size (`int`, *optional*, defaults to 1024):
            The expected output resolution of the image.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        mask_input_channels (`int`, *optional*, defaults to 16):
            The number of channels to be fed to the `MaskDecoder` module.
        num_point_embeddings (`int`, *optional*, defaults to 4):
            The number of point embeddings to be used.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the encoder and pooler.
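        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.

    Example (an illustrative sketch following the style of the [`SamConfig`] example below; the printed value assumes
    the defaults above):

    ```python
    >>> from transformers import SamPromptEncoderConfig

    >>> # Initializing a SamPromptEncoderConfig with facebook/sam-vit-huge style defaults
    >>> configuration = SamPromptEncoderConfig()

    >>> # Prompts are embedded on an (image_size // patch_size) ** 2 grid, here 64 x 64
    >>> configuration.image_embedding_size
    64
    ```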
    """

    def __init__(
        self,
        hidden_size=256,
        image_size=1024,
        patch_size=16,
        mask_input_channels=16,
        num_point_embeddings=4,
        hidden_act="gelu",
        layer_norm_eps=1e-6,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.image_size = image_size
        self.patch_size = patch_size
        # Prompt embeddings live on a (image_size // patch_size) ** 2 grid that
        # mirrors the vision encoder's patch grid.
        self.image_embedding_size = image_size // patch_size
        self.mask_input_channels = mask_input_channels
        self.num_point_embeddings = num_point_embeddings
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps


class SamMaskDecoderConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`SamMaskDecoder`]. It is used to instantiate a SAM
    mask decoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the SAM-vit-h
    [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the hidden states.
        hidden_act (`str`, *optional*, defaults to `"relu"`):
            The non-linear activation function used inside the `SamMaskDecoder` module.
        mlp_dim (`int`, *optional*, defaults to 2048):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 2):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        attention_downsample_rate (`int`, *optional*, defaults to 2):
            The downsampling rate of the attention layer.
        num_multimask_outputs (`int`, *optional*, defaults to 3):
            The number of outputs from the `SamMaskDecoder` module. In the Segment Anything paper, this is set to 3.
        iou_head_depth (`int`, *optional*, defaults to 3):
            The number of layers in the IoU head module.
        iou_head_hidden_dim (`int`, *optional*, defaults to 256):
            The dimensionality of the hidden states in the IoU head module.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
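
    Example (an illustrative sketch following the style of the [`SamConfig`] example below; the printed value assumes
    the defaults above):

    ```python
    >>> from transformers import SamMaskDecoderConfig

    >>> # Initializing a SamMaskDecoderConfig with facebook/sam-vit-huge style defaults
    >>> configuration = SamMaskDecoderConfig()

    >>> # As in the Segment Anything paper, the decoder proposes three masks per prompt
    >>> configuration.num_multimask_outputs
    3
    ```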

    """

    def __init__(
        self,
        hidden_size=256,
        hidden_act="relu",
        mlp_dim=2048,
        num_hidden_layers=2,
        num_attention_heads=8,
        attention_downsample_rate=2,
        num_multimask_outputs=3,
        iou_head_depth=3,
        iou_head_hidden_dim=256,
        layer_norm_eps=1e-6,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.mlp_dim = mlp_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.attention_downsample_rate = attention_downsample_rate
        self.num_multimask_outputs = num_multimask_outputs
        self.iou_head_depth = iou_head_depth
        self.iou_head_hidden_dim = iou_head_hidden_dim
        self.layer_norm_eps = layer_norm_eps


class SamVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`SamVisionModel`]. It is used to instantiate a SAM
    vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the SAM ViT-h
    [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        output_channels (`int`, *optional*, defaults to 256):
            Dimensionality of the output channels in the Patch Encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input image.
        image_size (`int`, *optional*, defaults to 1024):
            Expected resolution: the target size of the resized input image.
        patch_size (`int`, *optional*, defaults to 16):
            Size of the patches to be extracted from the input image.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string).
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 1e-10):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        qkv_bias (`bool`, *optional*, defaults to `True`):
            Whether to add a bias to query, key, value projections.
        mlp_ratio (`float`, *optional*, defaults to 4.0):
            Ratio of mlp hidden dim to embedding dim.
        use_abs_pos (`bool`, *optional*, defaults to `True`):
            Whether to use absolute position embedding.
        use_rel_pos (`bool`, *optional*, defaults to `True`):
            Whether to use relative position embedding.
        window_size (`int`, *optional*, defaults to 14):
            Window size for relative position.
        global_attn_indexes (`List[int]`, *optional*, defaults to `[2, 5, 8, 11]`):
            The indexes of the global attention layers.
        num_pos_feats (`int`, *optional*, defaults to 128):
            The dimensionality of the position embedding.
        mlp_dim (`int`, *optional*):
            The dimensionality of the MLP layer in the Transformer encoder. If `None`, defaults to `mlp_ratio *
            hidden_size`.
    """

    def __init__(
        self,
        hidden_size=768,
        output_channels=256,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=1024,
        patch_size=16,
        hidden_act="gelu",
        layer_norm_eps=1e-6,
        attention_dropout=0.0,
        initializer_range=1e-10,
        qkv_bias=True,
        mlp_ratio=4.0,
        use_abs_pos=True,
        use_rel_pos=True,
        window_size=14,
        global_attn_indexes=[2, 5, 8, 11],
        num_pos_feats=128,
        mlp_dim=None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.output_channels = output_channels
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.image_size = image_size
        self.patch_size = patch_size
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps
        self.attention_dropout = attention_dropout
        self.initializer_range = initializer_range
        self.qkv_bias = qkv_bias
        self.mlp_ratio = mlp_ratio
        self.use_abs_pos = use_abs_pos
        self.use_rel_pos = use_rel_pos
        self.window_size = window_size
        self.global_attn_indexes = global_attn_indexes
        self.num_pos_feats = num_pos_feats
        # Fall back to the standard ViT width (mlp_ratio * hidden_size) when no
        # explicit MLP dimension is given.
        self.mlp_dim = int(mlp_ratio * hidden_size) if mlp_dim is None else mlp_dim


class SamConfig(PretrainedConfig):
    r"""
    [`SamConfig`] is the configuration class to store the configuration of a [`SamModel`]. It is used to instantiate a
    SAM model according to the specified arguments, defining the vision model, prompt-encoder model and mask decoder
    configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the
    SAM-ViT-H [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (Union[`dict`, `SamVisionConfig`], *optional*):
            Dictionary of configuration options used to initialize [`SamVisionConfig`].
        prompt_encoder_config (Union[`dict`, `SamPromptEncoderConfig`], *optional*):
            Dictionary of configuration options used to initialize [`SamPromptEncoderConfig`].
        mask_decoder_config (Union[`dict`, `SamMaskDecoderConfig`], *optional*):
            Dictionary of configuration options used to initialize [`SamMaskDecoderConfig`].

        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import (
    ...     SamVisionConfig,
    ...     SamPromptEncoderConfig,
    ...     SamMaskDecoderConfig,
    ...     SamModel,
    ... )

    >>> # Initializing a SamConfig with `"facebook/sam-vit-huge"` style configuration
    >>> configuration = SamConfig()

    >>> # Initializing a SamModel (with random weights) from the `"facebook/sam-vit-huge"` style configuration
    >>> model = SamModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a SamConfig from a SamVisionConfig, SamPromptEncoderConfig, and SamMaskDecoderConfig

    >>> # Initializing the SAM vision encoder, prompt encoder, and mask decoder configurations
    >>> vision_config = SamVisionConfig()
    >>> prompt_encoder_config = SamPromptEncoderConfig()
    >>> mask_decoder_config = SamMaskDecoderConfig()

    >>> config = SamConfig(vision_config, prompt_encoder_config, mask_decoder_config)
    ```"""

    model_type = "sam"

    def __init__(
        self,
        vision_config=None,
        prompt_encoder_config=None,
        mask_decoder_config=None,
        initializer_range=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)
        vision_config = vision_config if vision_config is not None else {}
        prompt_encoder_config = prompt_encoder_config if prompt_encoder_config is not None else {}
        mask_decoder_config = mask_decoder_config if mask_decoder_config is not None else {}

        # Each sub-config may be passed either as a config object or as a plain dict.
        if isinstance(vision_config, SamVisionConfig):
            vision_config = vision_config.to_dict()
        if isinstance(prompt_encoder_config, SamPromptEncoderConfig):
            prompt_encoder_config = prompt_encoder_config.to_dict()
        if isinstance(mask_decoder_config, SamMaskDecoderConfig):
            mask_decoder_config = mask_decoder_config.to_dict()

        self.vision_config = SamVisionConfig(**vision_config)
        self.prompt_encoder_config = SamPromptEncoderConfig(**prompt_encoder_config)
        self.mask_decoder_config = SamMaskDecoderConfig(**mask_decoder_config)
        self.initializer_range = initializer_range