
    gf                        d Z ddlZddlmZ ddlmZmZmZmZm	Z	 ddl
Z
ddlZ
ddl
mZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZmZ ddlmZmZm Z  ddl!m"Z"  ej#        e$          Z% e             rddl&m'Z' ndZ' e            rddl(m)Z)m*Z* ddl+m,Z, nd\  Z,Z*Z) e            r	ddl-m.Z.m/Z/ nd\  Z/Z. e0e,e*e.e/e)f          Z1dZ2dZ3 G d dej4                  Z5 G d dej4                  Z6 G d dej4                  Z7 G d de          Z8e G d  d!e                      Z9e G d" d#e                      Z:d$Z;d%Z< ed&e;           G d' d(e8                      Z= ed)e;           G d* d+e8e                      Z>dS ),zPyTorch MAMBA model.    N)	dataclass)AnyDictOptionalTupleUnion)nn)CrossEntropyLoss   )ACT2FN)
MambaCache)GenerationMixin)PreTrainedModel)ModelOutputadd_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardlogging)is_causal_conv1d_availableis_mamba_ssm_availableis_mambapy_available   )MambaConfig)pscan)mamba_inner_fnselective_scan_fn)selective_state_updateNNN)causal_conv1d_fncausal_conv1d_update)NNzstate-spaces/mamba-130m-hfr   c            
       0    e Zd ZdZdedef fdZ	 	 	 ddej        de	e
         de	ej                 d	e	ej                 fd
Zdde	e
         de	ej                 d	e	ej                 fdZ	 	 	 dde	e
         de	ej                 d	e	ej                 fdZ xZS )
MambaMixeru  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    config	layer_idxc           	         t                                                       || _        |j        | _        |j        | _        |j        | _        |j        | _        t          |j
                  | _
        || _        |j        | _        t          j        | j        | j        |j        |j        | j        |j        dz
            | _        |j        | _        t$          |j                 | _        |j        | _        t          j        | j        | j        dz  |j                  | _        t          j        | j        | j
        | j        dz  z   d          | _        t          j        | j
        | j        d          | _        t5          j        d| j        dz   t4          j                  d d d f         }|                    | j        d                                          }t          j        t5          j         |                    | _!        t          j        t5          j"        | j                            | _#        t          j        | j        | j        |j                  | _$        |j        | _        tJ          s\| j        r9tM                      rtN          (                    d	           d S tS          d
          tN          (                    d           d S d S )Nr   )in_channelsout_channelsbiaskernel_sizegroupspadding   r(   FTdtypea7  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the mamba.py backend. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1dzuse_mambapy is set to True but the mambapy package is not installed. To install it follow https://github.com/alxndrTL/mamba.py.a  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the sequential implementation of Mamba, as use_mambapy is set to False. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d. For the mamba.py backend, follow https://github.com/alxndrTL/mamba.py.)*super__init__r#   hidden_size
state_sizessm_state_sizeconv_kernelconv_kernel_sizeintermediate_sizeinttime_step_rankr$   use_conv_biasr	   Conv1dconv1d
hidden_act
activationr   actuse_mambapyLinearuse_biasin_projx_projdt_projtorcharangefloat32expand
contiguous	ParameterlogA_logonesDout_projis_fast_path_availabler   loggerwarning_onceImportError)selfr#   r$   A	__class__s       d/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/mamba/modeling_mamba.pyr2   zMambaMixer.__init__K   s   !-$/ & 2!'!9!&"788"#1i./%*)&*
 
 
 !+&+,!- y!143IA3MTZTcdddi 68KdNadeNe8elqrrry!4d6LSWXXX LD/!35=III$PQPQPQ'RHHT+R00;;==\%)A,,//
ej)?@@AA	$"8$:JQWQ`aaa% 	 ')) 	''F     & Z   ##J    	 	    Nhidden_statescache_paramscache_positionattention_maskc                 	   |                      |                              dd          }| j        r|t          || j        j        | j        r| j        j        nd | j        j        | j	        j        | j
        j        | j        r| j
        j                                        nd t          j        | j                                                   d d | j                                        | j	        j                                        d          }ny|                    dd          \  }}|||                    d          z  }| j        j                            | j        j                            d          | j        j                            d                    }|g|d         dk    r[t+          |                    d          |j        | j                 || j        j        | j                  }|                    d          }nu|Qt4          j                            || j        |j        d         z
  df          }	|                    | j        |	|           tA          ||| j        j        | j                  }|||                    d          z  }|                     |                    dd                    }
t          j!        |
| j"        | j#        | j#        gd          \  }}}| j	        j        |                    dd          z  }t          j        | j                                                   }tI          | j	        d	          r| j	        j                                        nd }|t|d         dk    rhtK          |j&        | j                 |d
         |d
         ||d d df         |d d df         | j        |d
         |d
  
                            d          }nztO          ||||                    dd          |                    dd          | j                                        ||dd
  
        \  }}|||(                    | j        |           | 
                    |                    dd                    }|S )Nr   r,   T)
delta_biasdelta_softplusdimr   r0   )r?   r(   ).r   )dt_softplus)ra   return_last_state))rD   	transposetrainingr   r=   weightr;   r(   rE   rF   rQ   rC   floatrG   exprN   rP   chunk	unsqueezeviewsizer    squeezeconv_statesr$   r?   r	   
functionalpadr7   shapeupdate_conv_stater   splitr:   r5   hasattrr   
ssm_statesr   update_ssm_state)rV   r[   r\   r]   r^   projected_statescontextualized_statesgateconv_weightsrp   ssm_parameters	time_stepBCdiscrete_time_steprW   time_proj_biasscan_outputs	ssm_states                      rY   cuda_kernels_forwardzMambaMixer.cuda_kernels_forward   sr     <<66@@AFF= X	P\1$2 "$($6@  D"#$.2mE"((***4:++--...<,2244#% % %!!" #3"8"8"8"B"BM4) -0H0H0K0K K  ;-224;3E3J3J13M3Mt{OaOfOfghOiOijjL'N1,=,A,A 4!))"-- ,T^< K$O! ! !. 7 7 ; ;+"$-"3"3%(=@STV@W(WYZ'[# #K !224>;P^___ 0!<1Ado! ! ! ) -0H0H0K0K K "[[)@)@A)F)FGGN#k!4d6I4K^ _eg  OIq! "&!4y7J7J1a7P7P!P4:++--...A:A$,PV:W:WaT\.44666]aN'N1,=,A,A5 +DN;!&)&v.aaadGaaadGFL" $      )B--  +<!&KK1%%KK1%%FLLNN"#'&*+ + +'i (\-E 11$.)LLL %)MM,2H2HA2N2N$O$O!$$rZ   c           	      d   |j         \  }}}|j        }|                     |                              dd          }	|	                    dd          \  }
}||
|                    d          z  }
|v|j        | j                                                 }|	                    |
j
                  }|j         d         | j        k    rt          j                            |
| j        |
j         d         z
  df          }|                    | j        ||           |                     |                     |
          dd |f                   }
n|                    | j        |
|          }t%          j        || j        j        d d dd d f         z  d          }
| j        r|
| j        j        z  }
|                     |
          	                    |                              d          }
n[t%          j        || j        | j        f|
j
        |          }|                     |                     |
          dd |f                   }
||
|                    d          z  }
|                     |
                    dd                    }t%          j        || j        | j        | j        gd          \  }}}|                     |          }t          j                            |                              dd          }t%          j        | j         !                                           }t%          j        |d d d d d d f         |d d d d d d d f         z            }|d d d d d d d f         |d d d d d d d f         !                                z  }||
d d d d d d d f         !                                z  }| j"        r| j#        r|tI          |                    dd          |                    dd                    }||                    d          z  %                    d                              dd          }||
| j&        d d d d f         z  z   }||                     |          z  }ng }tO          |          D ]}|d d d d |d d f         |z  |d d d d |d d f         z   }t%          j(        |	                    |          |d d |d d f                             d                    }|)                    |d d d d df                    t%          j*        |d          }||
| j&        d d d d f         z  z   }||                     |          z  }|%|j        | j                 +                    |           | ,                    |                    dd                    }|S )	Nr   r,   rb   r   r0   .devicer/   r   )-rs   r/   rD   rf   rk   rl   rw   r$   clonetor   r7   r	   rq   rr   rt   r@   r=   rG   sumrh   r;   r(   zerosr8   r5   rE   ru   r:   rF   softplusrj   rN   ri   rA   rg   r   ro   rP   rangematmulappendstackcopy_rQ   )rV   input_statesr\   r]   r^   
batch_sizeseq_len_r/   ry   r[   r{   r   
conv_stater}   r~   r   r   r   rW   
discrete_A
discrete_BdeltaB_uhsscan_outputr   irz   s                               rY   slow_forwardzMambaMixer.slow_forward   s   !-!3
GQ"<<55??1EE.44QA4>>t%)N,D,DQ,G,GGM #$/?EEGGI!]%9::I #A&$*???]..!*]-@-DDaH 

 ..t~z>ZZZ $])C)CC'M)R S S);;DNM[ijj
 %	*t{7I!!!QPQPQPQ'7R*RXZ [ [ [% 6!T[%55M $ 7 7 : :5 A A K KB O OT3T5HI$+5  I !HHT[[%?%?XgX%NOOM%)N,D,DQ,G,GGM ]%<%<Q%B%BCC+T0$2EtGZ[ac
 
 
	1a "\\)44]334FGGQQRSUVWW Ytz''))***Yqqqq$!125G111aaaQU5VVWW
'111aaa6111dAAAqqq=9I9O9O9Q9QQ
aaaAAAtm < B B D DD  	I 	I,2Fz++Aq1183E3Ea3K3KLLBB/88;;EEaKKK%tQQQ}8M(MMK%6KKL7^^ : :&qqq!!!Qz2Y>!!!QQQPQSTSTST*AUU	#l9<<+>+>!!!Q'
@T@TUW@X@XYY##K111a$89999+l;;;K%aaa9N)NOK&$7K''7==iHHH !%k.C.CAq.I.I J J$$rZ   c                     t           rNd| j        j        j        j        v r6t
          j                                        s|                     ||||          S | 	                    ||||          S )Ncuda)
rR   rE   rh   r   typerG   _dynamois_compilingr   r   )rV   r[   r\   r]   r^   s        rY   forwardzMambaMixer.forward>  sl     " 	jf0B0I0N&N&NW\WdWqWqWsWs&N,,]L.Zhiii  nn]]]rZ   r   )__name__
__module____qualname____doc__r   r9   r2   rG   Tensorr   r   
LongTensorr   r   r   __classcell__rX   s   @rY   r"   r"   C   s        :{ :s : : : : : :~ .25959c% c%|c% z*c% !!12	c%
 !!12c% c% c% c%LN% N%x
7K N%aijojza{ N%  S[  \a  \l  Sm N% N% N% N%h .25959	^ 	^ z*	^ !!12		^
 !!12	^ 	^ 	^ 	^ 	^ 	^ 	^ 	^rZ   r"   c                   ,     e Zd Zd fd	Zd Zd Z xZS )MambaRMSNormư>c                     t                                                       t          j        t	          j        |                    | _        || _        dS )zL
        MambaRMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm
        N)r1   r2   r	   rL   rG   rO   rh   variance_epsilon)rV   r3   epsrX   s      rY   r2   zMambaRMSNorm.__init__K  sD     	l5:k#:#:;; #rZ   c                    |j         }|                    t          j                  }|                    d                              dd          }|t          j        || j        z             z  }| j        |                    |          z  S )Nr,   r0   T)keepdim)	r/   r   rG   rI   powmeanrsqrtr   rh   )rV   r[   input_dtypevariances       rY   r   zMambaRMSNorm.forwardS  s|    #)%((77 $$Q'',,R,>>%Ht?T4T(U(UU{]--k::::rZ   c                 :    | j         j        d          d| j         S )Nr   z, eps=)rh   rs   r   rV   s    rY   
extra_reprzMambaRMSNorm.extra_reprZ  s#    +#A&EEd.CEEErZ   )r   )r   r   r   r2   r   r   r   r   s   @rY   r   r   J  sb        $ $ $ $ $ $; ; ;F F F F F F FrZ   r   c                   r     e Zd Z fdZ	 	 	 ddee         deej                 deej                 fdZ xZ	S )
MambaBlockc                     t                                                       || _        || _        |j        | _        t          |j        |j                  | _        t          ||          | _
        d S )Nr   r$   )r1   r2   r#   r$   residual_in_fp32r   r3   layer_norm_epsilonnormr"   mixer)rV   r#   r$   rX   s      rY   r2   zMambaBlock.__init___  sd    " & 7 !39RSSS	)<<<


rZ   Nr\   r]   r^   c                    |}|                      |                    | j         j        j                            }| j        r|                    t
          j                  }|                     ||||          }||z   }|S )Nr.   r\   r]   r^   )r   r   rh   r/   r   rG   rI   r   )rV   r[   r\   r]   r^   residuals         rY   r   zMambaBlock.forwardg  s     !		-"2"29I9O"2"P"PQQ  	2{{5=11H

^dr # 
 
 !=0rZ   r   )
r   r   r   r2   r   r   rG   r   r   r   r   s   @rY   r   r   ^  s        = = = = = .25959  z* !!12	
 !!12       rZ   r   c                   0    e Zd ZdZeZdZddgZdZdZ	d Z
dS )MambaPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    backboner   r"   Tc                    t          |t                    rd|j        _        d|j        _        | j        j        dz  | j        j        z  }| j        j        dk    r+t          j
                            |j        j        |           n<| j        j        dk    r,t          j
                            |j        j        | |           t          j        t          j        | j        j                  t'          j        | j        j                  t'          j        | j        j                  z
  z  t'          j        | j        j                  z                                 | j        j                  }|t          j        t          j        |                      z   }t          j                    5  |j        j                            |           ddd           n# 1 swxY w Y   d|j        j        _        t          |t          j                  rB|j        :t?          |j        dd          s$t          j
                             |j                   nJt          |t          j!                  r0t          j
        "                    |j        | j        j#        	           | j        j$        r|%                                D ]\  }}|d
v rt          j
        &                    |t'          j'        d                     t          j                    5  |t'          j'        | j        j(                  z  }ddd           n# 1 swxY w Y   dS dS )zInitialize the weights.Tg      constantrandom)minN
_no_reinitF)std)zout_proj.weight   )a))
isinstancer"   rN   _no_weight_decayrP   r#   r:   time_step_scaletime_step_init_schemer	   init	constant_rF   rh   uniform_rG   rj   randr8   mathrM   time_step_maxtime_step_minclamptime_step_floorexpm1no_gradr(   r   r   rB   getattrzeros_	Embeddingnormal_initializer_rangerescale_prenorm_residualnamed_parameterskaiming_uniform_sqrtnum_hidden_layers)rV   moduledt_init_stddtinv_dtnameps          rY   _init_weightsz"MambaPreTrainedModel._init_weights  sQ   fj)) 	2,0FL)(,FH%+4d:T[=XXK{0J>>!!&."7EEEE2h>>  !6kRRR
4;8998DK566$+B[9\9\\^(4;4556  e3e44	  %)U["%5%5$5666F 2 2#))&1112 2 2 2 2 2 2 2 2 2 2 2 2 2 2-1FN*fbi(( 	N{&v{L%@@ 0GNN6;///-- 	NGOOFMt{/LOMMM;/ 	F "2244 F Fa...
 G,,Q$)A,,,??? F FTYt{'DEEEF F F F F F F F F F F F F F F	F 	FF Fs$   + GGG"L66L:	=L:	N)r   r   r   r   r   config_classbase_model_prefix_no_split_modulessupports_gradient_checkpointing_is_statefulr    rZ   rY   r   r   z  sU         
 L"%|4&*#L-F -F -F -F -FrZ   r   c                       e Zd ZU dZdZeej                 ed<   dZ	ee
         ed<   dZeeej                          ed<   dS )MambaOutputa#  
    Class for the MAMBA model outputs.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        cache_params (`MambaCache`):
            The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
            avoid providing the old `input_ids`.

            Includes both the State space model state matrices after the selective scan, and the Convolutional states
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    Nlast_hidden_stater\   r[   )r   r   r   r   r   r   rG   FloatTensor__annotations__r\   r   r[   r   r   rZ   rY   r   r     si          $ 6:x 12999)-L(:&---8<M8E%"345<<<<<rZ   r   c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
ee         ed<   dZeeej                          ed<   dS )MambaCausalLMOutputa  
    Base class for causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        cache_params (`MambaCache`):
            The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
            avoid providing the old `input_ids`.

            Includes both the State space model state matrices after the selective scan, and the Convolutional states
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    Nlosslogitsr\   r[   )r   r   r   r   r  r   rG   r   r   r  r\   r   r[   r   r   rZ   rY   r   r     s          ( )-D(5$
%,,,*.FHU&'...)-L(:&---8<M8E%"345<<<<<rZ   r   a?  

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`MambaConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            Indices of input sequence tokens in the vocabulary.

            If `cache_params.seqlen_offset>0`, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        cache_params (`MambaCache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
            the complete sequence length.
z_The bare MAMBA Model transformer outputting raw hidden-states without any specific head on top.c                   P    e Zd Z fdZd Zd Zd Z ee           e	e
ee          	 	 	 	 	 	 	 	 ddeej                 deej                 d	ee         d
ee         dee         dee         deej                 deej                 deeef         fd                        Z xZS )
MambaModelc                    t                                                     t          j        j        j                  | _        t          j        fdt          j	                  D                       | _
        d| _        t          j        j                  | _        |                     | j                   |                                  d S )Nc                 2    g | ]}t          |           S )r   )r   ).0idxr#   s     rY   
<listcomp>z'MambaModel.__init__.<locals>.<listcomp>%  s&    $r$r$r3Z#%F%F%F$r$r$rrZ   Fr   )r1   r2   r	   r   
vocab_sizer3   
embeddings
ModuleListr   r   layersgradient_checkpointingr   r   norm_f"_register_load_state_dict_pre_hook	load_hook	post_initrV   r#   rX   s    `rY   r2   zMambaModel.__init__!  s       ,v'8&:LMMm$r$r$r$rRWX^XpRqRq$r$r$rss&+#"6#56;TUUU//???rZ   c                 v    |D ]5}d|v r/|                     |          ||                    dd          <    d S 6d S )Nz
embedding.zembeddings.)popreplace)rV   
state_dictprefixargsks        rY   r  zMambaModel.load_hook-  sW     	 	Aq  EO^^TUEVEV
199\=AAB !	 	rZ   c                     | j         S Nr  r   s    rY   get_input_embeddingszMambaModel.get_input_embeddings3  s
    rZ   c                     || _         d S r  r  rV   new_embeddingss     rY   set_input_embeddingszMambaModel.set_input_embeddings6  s    (rZ   
checkpointoutput_typer   N	input_idsinputs_embedsr\   	use_cacheoutput_hidden_statesreturn_dictr]   r^   returnc	                 ,   ||n| j         j        }||n| j        s| j         j        nd}||n| j         j        }|d u |d uz  rt          d          ||                     |          }| j        r| j        r|rd}|rp|\t          | j         |	                    d          |j
        |j                  }t          j        d| j         j        |j
                  }n|t          d          nd }|}	|rdnd }
| j        D ]F}| j        r&| j        r|                     |j        |	|||          }	n ||	|||          }	|r|
|	fz   }
G|                     |	          }	|r|
|	fz   }
|st'          d	 |	||
fD                       S t)          |	|r|nd |

          S )NFz:You must specify exactly one of input_ids or inputs_embedsr   r   r   zYou have to specify the `cache_position` manually when `use_cache=True` and `cache_params` is passed, you don't have to pass a `cache_params` if you are in prefilling stage because in that case it will be initialized for you automaticallyr   r   c              3      K   | ]}||V  	d S r  r   )r  vs     rY   	<genexpr>z%MambaModel.forward.<locals>.<genexpr>  s(      ffqXYXeXeXeXeXeffrZ   )r   r\   r[   )r#   r)  rg   r(  use_return_dict
ValueErrorr  r  r   rn   r   r/   rG   rH   r6   r  _gradient_checkpointing_func__call__r  tupler   )rV   r&  r'  r\   r(  r)  r*  r]   r^   r[   all_hidden_statesmixer_blocks               rY   r   zMambaModel.forward9  s_   $ %9$D  $+Jj 	 "+!6IIZ^Zg=rT[=R=Rmr	%0%<kk$+B]-t";< 	[YZZZ  OOI66M& 	4= 	Y 	I 	 #)K!3!3A!6!6}?S[h[n      "'a1HQ^Qe!f!f!f' !;  	 (  L%"6@BBD; 	I 	IK* 
t} 
 $ A A(-~We! ! !,!!-#1#1	! ! ! $ I$58H$H!M22 	E 1]4D D 	gff]LBS$Tffffff+)2<+
 
 
 	
rZ   )NNNNNNNN)r   r   r   r2   r  r  r"  r   MAMBA_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr   rG   r   r   boolr   r   r   r   r   s   @rY   r  r    s       

 
 
 
 
    ) ) ) +*+ABB&$   1548-1$(/3&*5959J
 J
E,-J
   01J
 z*	J

 D>J
 'tnJ
 d^J
 !!12J
 !!12J
 
uk!	"J
 J
 J
  CBJ
 J
 J
 J
 J
rZ   r  z
    The MAMBA Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c                       e Zd ZdgZ fdZd Zd Zd Zd Z	 dde	d	e
eef         d
ede
eef         fdZ	 	 	 	 	 ddee         deej                 deej                 fdZ ee           eeee          	 	 	 	 	 	 	 	 	 ddeej                 deej                 deej                 dee         deej                 dee         dee         dee         deej                 deeef         fd                        Z xZ S )MambaForCausalLMzlm_head.weightc                     t                                          |           t          |          | _        t	          j        |j        |j        d          | _        | 	                                 d S )NFr-   )
r1   r2   r  r   r	   rB   r3   r
  lm_headr  r  s     rY   r2   zMambaForCausalLM.__init__  s^       "6**y!3V5FUSSSrZ   c                     | j         S r  r?  r   s    rY   get_output_embeddingsz&MambaForCausalLM.get_output_embeddings  s
    |rZ   c                     || _         d S r  rA  r   s     rY   set_output_embeddingsz&MambaForCausalLM.set_output_embeddings  s    %rZ   c                 4    | j                                         S r  )r   r  r   s    rY   r  z%MambaForCausalLM.get_input_embeddings  s    }11333rZ   c                 6    | j                             |          S r  )r   r"  r   s     rY   r"  z%MambaForCausalLM.set_input_embeddings  s    }11.AAArZ   r   outputsmodel_kwargsnum_new_tokensr+  c                 6   |                     dd           |d<   |                     dd          r"d|v r|d         |d         dd          |z   |d<   d|v rC|d         }t          j        ||                    |j        d         df          gd	          |d<   |S )
Nr\   r(  Tr]   r0   r^   r   r   rb   )getrG   catnew_onesrs   )rV   rG  rH  rI  kwargsr^   s         rY   #_update_model_kwargs_for_generationz4MambaForCausalLM._update_model_kwargs_for_generation  s     (/{{>4'H'H^$[$//	b L00-.:-9:J-KBCC-PSa-aL)*|++)*:;N-2Y!8!8.:Nq:QST9U!V!VW]_. . .L)* rZ   Nr\   r]   r^   c                 J   |rg|t          d          |d         dk    r$|d d df                             d          }|d }n&t          j        d| j        j        |j                  }||d|i}nd|                                i}|                    ||||d           |S )Nz`cache_position` should not be None as it should have been initialized in `model.generate`, you are responsible for passing in a valid `cache_position` if you are calling `prepare_inputs_for_generation` directly with `use_cache=True`r   r0   r-  r'  r&  )r\   r(  r]   r^   )	r2  rl   rG   rH   r#   r6   r   rK   update)	rV   r&  r'  r(  r\   r]   r^   rN  model_inputss	            rY   prepare_inputs_for_generationz.MambaForCausalLM.prepare_inputs_for_generation  s      	c% e  
 a 1$$%aaae,66r::	!-%)N "'a1HQZQa!b!b!b$)=+];LL')=)=)?)?@L ,&"0"0	 	
 	
 	
 rZ   r#  r&  r'  labelsr)  r*  r(  c
           
         ||n| j         j        }|                     |||||||	|          }|d         }|                     |                    | j        j        j                                                            }d}||                    |j                  }|dddddf         	                                }|dddf         	                                }t                      } ||                    d|                    d                    |                    d                    }|s|f|dd         z   }||f|z   n|S t          |||j        |j                  S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        N)r\   r'  r)  r*  r(  r]   r^   r   .r0   r   )r  r  r\   r[   )r#   r1  r   r?  r   rh   r/   ri   r   rK   r
   rm   rn   r   r\   r[   )rV   r&  r^   r'  r\   rT  r)  r*  r(  r]   rN  mamba_outputsr[   r  r  shift_logitsshift_labelsloss_fctoutputs                      rY   r   zMambaForCausalLM.forward  s   2 &1%<kk$+B]%'!5#)) & 	
 	
 &a(m..t|/B/HIIJJPPRRYYv}--F!#ssAAA+.99;;L!#qrr'?5577L'))H8L--b,2C2CB2G2GHH,J[J[\^J_J_``D 	FYqrr!22F)-)9TGf$$vE"&3'5	
 
 
 	
rZ   )r   )NNNNN)	NNNNNNNNN)!r   r   r   _tied_weights_keysr2   rB  rD  r  r"  r   r   strr   r9   rO  r   r   rG   r   rS  r   r8  r   r9  r   r:  r   r;  r   r   r   r   r   r   s   @rY   r=  r=    sM        ++      & & &4 4 4B B B YZ "26sCx.RU	c3h   , -15959. .
 z*. !!12. !!12. . . .` +*+ABB&'$   155959-1-1/3&*$(157
 7
E,-7
 !!127
   12	7

 z*7
 )*7
 'tn7
 d^7
 D>7
 !.7
 
u))	*7
 7
 7
  CB7
 7
 7
 7
 7
rZ   r=  )?r   r   dataclassesr   typingr   r   r   r   r   rG   torch.utils.checkpointr	   torch.nnr
   activationsr   cache_utilsr   
generationr   modeling_utilsr   utilsr   r   r   r   r   utils.import_utilsr   r   r   configuration_mambar   
get_loggerr   rS   mambapy.pscanr   &mamba_ssm.ops.selective_scan_interfacer   r   +mamba_ssm.ops.triton.selective_state_updater   causal_conv1dr   r    allrR   r9  r:  Moduler"   r   r   r   r   r   MAMBA_START_DOCSTRINGr8  r  r=  r   rZ   rY   <module>rp     sK      ! ! ! ! ! ! 4 4 4 4 4 4 4 4 4 4 4 4 4 4            % % % % % % ! ! ! ! ! ! % % % % % % ) ) ) ) ) ) - - - - - -              k j j j j j j j j j , , , , , , 
	H	%	% #######E QXXXXXXXXRRRRRRR@P=-~ 8DDDDDDDDD-7**.0@BVXfg   3 D^ D^ D^ D^ D^ D^ D^ D^NF F F F F29 F F F(       89F 9F 9F 9F 9F? 9F 9F 9Fx = = = = =+ = = =0 = = = = =+ = = =6   B e i
 i
 i
 i
 i
% i
 i
	 i
X   V
 V
 V
 V
 V
+_ V
 V
 V
 V
 V
rZ   