
    g0                    :   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlmZ d dlmZ d dlmZmZmZ d dlmZ d dlmZmZmZmZmZmZmZmZ d dl m!Z! d dl"Z"d dl#m$Z$ d d	l%m&Z& d d
l"m'Z'm(Z( d dl)m*Z*m+Z+ d dl,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3 ddl4m5Z5m6Z6 ddl7m8Z8m9Z9m:Z: ddl;m<Z< ddl=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZE ddlFmGZGmHZH ddlImJZJ ddlKmLZL ddlMmNZNmOZOmPZPmQZQmRZRmSZSmTZTmUZUmVZVmWZWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZbmcZcmdZdmeZemfZfmgZgmhZhmiZimjZjmkZkmlZlmmZmmnZn ddlompZpmqZqmrZr ddlsmtZtmuZumvZvmwZw ddlxmyZymzZz e	j{        |                    dd          }                                Z~e	j{        |                    dd          }                                Z eb            rhd dlmZmZmZ d d lmZ d d!lmZmZmZmZmZmZmZmZmZ  e&j        ej        &                    d"                    Ze e&j        d#          k    rd d$lmZ  ei            rd d%lmZ d d&lmZ d d'lmZ  elj        e          Zd(ad) Zd* Z eu            r2d dlmc m"Z d d+lmZ  e&j        e           e&j        d,          k    Znd-Z eg            rdd.lMmZ e(j        j        e(j        j        e(j        j        e(j        j        e(j        j        e(j        j        e(j        j        e(j        j        e(j        j        e(j        j        e(j        j        e(j        j        e(j        j        e(j        j        d/Zedld0            Zd1ee(j        d2f         fd3Zd1ee(j        d2f         fd4Zd1ee(j        d2f         fd5Zd6 Zd7 Zd8 Zdmd:Zd;eYfd<eee"j'        f         d=eeef         d>efd?Zdnd@Z	 	 	 dodAeee	j        f         dBedCeeee"j        f                  dDefdEZdF ZdGe"j'        dHefdIZdmdJe(j        fdKZdLeee                  d<eee"j'        f         dHeeee                  ee         f         fdMZdLeee                  d<eee"j'        f         dHeeee                  ee         f         fdNZdpdOZdP ZdQ Z	 	 	 	 	 	 	 	 	 	 	 dqdRZdrd>edSe
e         dHefdTZ G dU d2          Z G dV dWe(j        ee6e\e8          Z e^ej                  e_        ej        j        ,ej        j                            dXdYdZ[          ej        _         G d\ d]e(j                  Z G d^ d_e(j                  Z G d` dae(j                  Ze G db dce[                      Z G dd dee(j                  Z G df dge(j                  ZdpdXe(j        dhedHe(j        fdiZdj Zdk ZdS )s    N)contextmanager)	dataclass)	lru_cachepartialwraps)Thread)AnyCallableDictListOptionalSetTupleUnion)
is_zipfile)"split_torch_state_dict_into_shards)version)Tensornn)CrossEntropyLossIdentity)
checkpoint   )get_activation)PretrainedConfig)custom_object_save)GenerationConfigGenerationMixin)PeftAdapterMixindeepspeed_configis_deepspeed_zero3_enabled)LOSS_MAPPING)Conv1Dapply_chunking_to_forward find_pruneable_heads_and_indicesid_tensor_storage#is_torch_greater_or_equal_than_1_13prune_conv1d_layerprune_layerprune_linear_layer)AutoHfQuantizerHfQuantizer)get_module_from_name)auto_conversion)!ACCELERATE_MIN_VERSIONADAPTER_SAFE_WEIGHTS_NAMEADAPTER_WEIGHTS_NAMECONFIG_NAMEDUMMY_INPUTSFLAX_WEIGHTS_NAMESAFE_WEIGHTS_INDEX_NAMESAFE_WEIGHTS_NAMETF2_WEIGHTS_NAMETF_WEIGHTS_NAMEWEIGHTS_INDEX_NAMEWEIGHTS_NAMEContextManagersModelOutputPushToHubMixincached_file	copy_funcdownload_urlextract_commit_hashhas_fileis_accelerate_availableis_bitsandbytes_availableis_flash_attn_2_availableis_offline_modeis_optimum_availableis_peft_availableis_remote_urlis_safetensors_availableis_torch_sdpa_availableis_torch_xla_availableloggingreplace_return_docstrings	strtobool)convert_file_size_to_intcreate_and_tag_model_cardget_checkpoint_shard_files)ENV_VARS_TRUE_VALUESis_sagemaker_mp_enabledis_torch_fx_proxyis_torchdynamo_compiling)BitsAndBytesConfigQuantizationMethodXLA_USE_BF160XLA_DOWNCAST_BF16)dispatch_modelinfer_auto_device_mapinit_empty_weights)add_hook_to_module)	$check_tied_parameters_on_same_deviceextract_model_from_parallelfind_tied_parametersget_balanced_memoryget_max_memoryload_offloaded_weightsoffload_weightsave_offload_indexset_module_tensor_to_device
accelerate0.31)get_state_dict_from_offload)	safe_open)	load_file)	save_fileTc                  >   t           j                                        ot           j                                        oat	          t
          j                            dd                    dk    o0t	          t
          j                            dd                    dk    S )NACCELERATE_USE_FSDPFalser   FSDP_CPU_RAM_EFFICIENT_LOADING)torchdistributedis_availableis_initializedrO   osenvironget     W/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/modeling_utils.pyis_fsdp_enabledr}      s    &&(( 	V,,..	Vbjnn%:GDDEEJ	V bjnn%EwOOPPTUU	r{   c                      t           j                                        oNt           j                                        o0t	          t
          j                            dd                    dk    S )N
LOCAL_RANKr   )rs   rt   ru   rv   intrw   rx   ry   rz   r{   r|   is_local_dist_rank_0r      sU    &&(( 	7,,..	7
|R0011Q6r{   __version__z1.10F)find_adapter_config_file)uniform_normal_trunc_normal_	constant_xavier_uniform_xavier_normal_kaiming_uniform_kaiming_normal_uniformnormalxavier_uniformxavier_normalkaiming_uniformkaiming_normalc              #     K   t           }| rDda d }t                                          D ]%\  }}t          t          j        j        ||           &	 dV  |a | r?t                                          D ]'\  }}t          t          j        j        ||           &dS dS # |a | r?t                                          D ]&\  }}t          t          j        j        ||           &w w xY w)z
    Context manager to globally disable weight initialization to speed up loading large models.

    TODO(Patrick): Delete safety argument `_enable=True` at next major version. .
    Fc                      d S Nrz   )argskwargss     r|   
_skip_initz#no_init_weights.<locals>._skip_init   s    Dr{   N)_init_weightsTORCH_INIT_FUNCTIONSitemssetattrrs   r   init)_enableold_init_weightsr   name	init_funcs        r|   no_init_weightsr      s       % 5	 	 	  499;; 	5 	5OD)EHM444448( 	8#7#=#=#?#? 8 8itY7777	8 	88 8 ) 	8#7#=#=#?#? 8 8itY7777	88s   B AC"	parameterModuleUtilsMixinc                 <   	 t          |                                           j        S # t          $ ri dt          j        dt          t          t          t          f                  fd}| 
                    |          }t          |          }|d         j        cY S w xY w)Nmodulereturnc                 L    d | j                                         D             }|S )Nc                 D    g | ]\  }}t          j        |          ||fS rz   rs   	is_tensor.0kvs      r|   
<listcomp>zHget_parameter_device.<locals>.find_tensor_attributes.<locals>.<listcomp>   0    WWWAEOTUDVDVWq!fWWWr{   __dict__r   r   tupless     r|   find_tensor_attributesz4get_parameter_device.<locals>.find_tensor_attributes   (    WW)>)>)@)@WWWFMr{   get_members_fnr   )next
parametersdeviceStopIterationr   Moduler   r   strr   _named_membersr   r   genfirst_tuples       r|   get_parameter_devicer      s    %I((**++22 	% 	% 	%	29 	eCK>P9Q 	 	 	 	 &&6L&MM3ii1~$$$$	%   %( A0BBc                 <   	 t          |                                           j        S # t          $ ri dt          j        dt          t          t          t          f                  fd}| 
                    |          }t          |          }|d         j        cY S w xY w)z`
    Returns the first parameter dtype (can be non-floating) or asserts if none were found.
    r   r   c                 L    d | j                                         D             }|S )Nc                 D    g | ]\  }}t          j        |          ||fS rz   r   r   s      r|   r   zMget_first_parameter_dtype.<locals>.find_tensor_attributes.<locals>.<listcomp>   r   r{   r   r   s     r|   r   z9get_first_parameter_dtype.<locals>.find_tensor_attributes   r   r{   r   r   )r   r   dtyper   r   r   r   r   r   r   r   r   s       r|   get_first_parameter_dtyper      s    $I((**++11 	$ 	$ 	$	29 	eCK>P9Q 	 	 	 	 &&6L&MM3ii1~####	$r   c                 ,   d}|                                  D ]}|j        }|                                rt          t          v rt                      rt          j        c S t          t          v rTt                      rF|j        t          j	        k    rt          j        c S |j        t          j
        k    rt          j        c S |j        c S ||S dt          j        dt          t          t           t"          f                  fd}|                     |          }d}|D ]-}|}|d                                         r|d         j        c S .||d         j        S |                                 D ]&}|j        }|                                r	|j        c S '|S )zz
    Returns the first found floating dtype in parameters if there is one, otherwise returns the last dtype it found.
    Nr   r   c                 L    d | j                                         D             }|S )Nc                 D    g | ]\  }}t          j        |          ||fS rz   r   r   s      r|   r   zGget_parameter_dtype.<locals>.find_tensor_attributes.<locals>.<listcomp>  s0    SSSTQPQ@R@RS1a&SSSr{   r   r   s     r|   r   z3get_parameter_dtype.<locals>.find_tensor_attributes  s(    SSV_%:%:%<%<SSSr{   r   r   )r   r   is_floating_pointrY   rS   rL   rs   bfloat16r[   floatdoublefloat32r   r   r   r   r   r   r   buffers)r   
last_dtypetr   r   
last_tupletuples          r|   get_parameter_dtyper      s    J!!##  W
   	
 3338N8P8P3~%%% $888=S=U=U87ek)) >)))7el** =(((7NNN	 ry T%V:L5M     
"
"2H
"
I
ICJ " "
8%%'' 	"8>!!!	" !}""     W
   	7NNN	r{   c                     |                                  D ]}|                                r	|j        c S  t          d          )z_
    Returns the first found floating dtype in `state_dict` or asserts if none were found.
    z5couldn't find any floating point dtypes in state_dict)valuesr   r   
ValueError
state_dictr   s     r|   get_state_dict_float_dtyper   *  sT            	7NNN	 L
M
MMr{   c                     |                                  D ]}|                                r	|j        c S  t          |                                            j        S )zt
    Returns the first found floating dtype in `state_dict` if there is one, otherwise returns the first dtype.
    )r   r   r   r   r   s     r|   get_state_dict_dtyper   5  sb        / /   	7NNN	
 J%%''((..r{   c                     | t           j        k    rdS t          j        dt	          |                     }|t          d|  d          t          |                                d                   }|dz  S )z
    Returns the size (in bytes) occupied by one parameter of type `dtype`.

    Example:

    ```py
    >>> dtype_byte_size(torch.float32)
    4
    ```
    g      ?z[^\d](\d+)_?Nz`dtype` is not a valid dtype: .r      )rs   boolresearchr   r   r   groups)r   
bit_searchbit_sizes      r|   dtype_byte_sizer   B  su     
u?CJJ77JB%BBBCCC:$$&&q)**Hq=r{    c                    t          fd|D                       dk    rdS t                      rdS t          | dd          s)t                              | j        j         d           dS t          |                                 	                                          d         }|z   |v r1||z            j
        |                                 |         j
        k    S dS )a>  
    Checks if `model_to_load` supports param buffer assignment (such
    as when loading in empty weights) by first checking
    if the model explicitly disables it, then by ensuring that the state dict keys
    are a subset of the model's parameters.

    Note: We fully disable this if we are using `deepspeed`
    c                 >    g | ]}|                               |S rz   
startswith)r   keystart_prefixs     r|   r   z9check_support_param_buffer_assignment.<locals>.<listcomp>_  s*    FFFC)E)EFCFFFr{   r   F!_supports_param_buffer_assignmentTzA does not support param buffer assignment, loading will be slower)lenr!   getattrloggerdebug	__class____name__listr   keysr   )model_to_loadr   r   	first_keys     ` r|   %check_support_param_buffer_assignmentr   V  s     FFFF:FFFGG1LLu!## u ="EtLL &/rrr	
 	
 	
 u ]--//446677:Ii:--,239]=U=U=W=WXa=b=hhh 5r{   10GBr   max_shard_sizeweights_namec           
         t                               d           t          |          }i g}d}d}i }|                                 D ]\  }}t	          |t
                    rt          |          }	|	|v r1|j        t          j        d          k    r||	         }
|||
         |<   _|	                                t          |j                  z  }||z   |k    r0t          |d                   dk    r|                    i            d}||d         |<   ||z  }||z  }t          |          dz
  ||	<   t          |          dk    r||d         idfS i }i }t          |          D ]\  }}|                    dd|dz   d	d
t          |          d	d          }|                    dd|dz   d	d
t          |          d	d          }|||<   |                                D ]}|||<   d|i}||d}||fS )a  
    Splits a model state dictionary in sub-checkpoints so that the final size of each sub-checkpoint does not exceed a
    given size.

    The sub-checkpoints are determined by iterating through the `state_dict` in the order of its keys, so there is no
    optimization made to make each sub-checkpoint as close as possible to the maximum size passed. For example, if the
    limit is 10GB and we have weights of sizes [6GB, 6GB, 2GB, 6GB, 2GB, 2GB] they will get sharded as [6GB], [6+2GB],
    [6+2+2GB] and not [6+2+2GB], [6+2GB], [6GB].

    <Tip warning={true}>

    If one of the model's weight is bigger than `max_shard_size`, it will end up in its own sub-checkpoint which will
    have a size greater than `max_shard_size`.

    </Tip>

    Args:
        state_dict (`Dict[str, torch.Tensor]`): The state dictionary of a model to save.
        max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`):
            The maximum size of each sub-checkpoint. If expressed as a string, needs to be digits followed by a unit
            (like `"5MB"`).
        weights_name (`str`, *optional*, defaults to `"pytorch_model.bin"`):
            The name of the model save file.
    zNote that `shard_checkpoint` is deprecated and will be removed in v4.44. We recommend you using split_torch_state_dict_into_shards from huggingface_hub libraryr   metar   r   N.bin-05dz-of-.safetensors
total_sizemetadata
weight_map)r   warningrP   r   
isinstancer   r&   r   rs   numelr   r   r   append	enumeratereplacer   )r   r   r  sharded_state_dictslast_block_sizer  storage_id_to_blockr   weight
storage_idblock_idweight_sizer  shardsidxshard
shard_filer
  indexs                      r|   shard_checkpointr  u  s   6 NN	J   .n==N$OJ!'')) G GV fc"" 	3*622J ,,,%,vBVBV1V1V*:6H17)#.llnnv|'D'DD [(>99cBUVXBY>Z>Z]^>^>^&&r***O'-B$;&k!
*-.A*B*BQ*FJ'' 1$$1!45t;; JF 344 ) )
U!))&2gc!e2g2g2gSI\E]E]2g2g2g2ghh
''[a[[[5H1I1I[[[[
 

 #z::<< 	) 	)C(JsOO	) j)H!<<E5=r{   c                    t           j                            |t                    }t           j                            |t                    }t           j                            |          }t           j                            |          }|s]|rt                      sMt                      rt          t          fnt          f}t          dd                    |           d| d          d}	|r6|r0t                      rd}	n#t          	                    d| d           n|sd}	|	r|n|}
t          |
d	d
          5 }t          j        |          }ddd           n# 1 swxY w Y   t          t          |d                                                             }|d                                         |                                                                 fdD             }fdD             }|rt%          |          dk    st%          |          dk    rd| j        j         }t%          |          dk    r(d                    d |D                       }|d| dz  }t%          |          dk    r(d                    d |D                       }|d| dz  }t+          |          t,          rddini }|	rt.          nt1          t2          j        fddi|}|D ]V} |t           j                            ||                    }|                     |d           ~t7          j                     Wt2          j        j        j                             ||          S )a  
    This is the same as
    [`torch.nn.Module.load_state_dict`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html?highlight=load_state_dict#torch.nn.Module.load_state_dict)
    but for a sharded checkpoint.

    This load is performed efficiently: each checkpoint shard is loaded one by one in RAM and deleted after being
    loaded in the model.

    Args:
        model (`torch.nn.Module`): The model in which to load the checkpoint.
        folder (`str` or `os.PathLike`): A path to a folder containing the sharded checkpoint.
        strict (`bool`, *optional`, defaults to `True`):
            Whether to strictly enforce that the keys in the model state dict match the keys in the sharded checkpoint.
        prefer_safe (`bool`, *optional*, defaults to `False`)
            If both safetensors and PyTorch save files are present in checkpoint and `prefer_safe` is True, the
            safetensors files will be loaded. Otherwise, PyTorch files are always loaded when possible.

    Returns:
        `NamedTuple`: A named tuple with `missing_keys` and `unexpected_keys` fields
            - `missing_keys` is a list of str containing the missing keys
            - `unexpected_keys` is a list of str containing the unexpected keys
    zCan't find a checkpoint index ( or z) in r   FTz"Cannot load sharded checkpoint at z+ safely since safetensors is not installed!rutf-8encodingNr  c                     g | ]}|v|	S rz   rz   )r   r   loaded_keyss     r|   r   z+load_sharded_checkpoint.<locals>.<listcomp>  s#    HHHCK1G1GC1G1G1Gr{   c                     g | ]}|v|	S rz   rz   )r   r   
model_keyss     r|   r   z+load_sharded_checkpoint.<locals>.<listcomp>  s#    KKKsS
5J5Js5J5J5Jr{   r   #Error(s) in loading state_dict for ,c                     g | ]}d | d 	S "rz   r   r   s     r|   r   z+load_sharded_checkpoint.<locals>.<listcomp>
  s     (H(H(HaQ(H(H(Hr{   z
Missing key(s): c                     g | ]}d | d 	S r,  rz   r.  s     r|   r   z+load_sharded_checkpoint.<locals>.<listcomp>  s     +N+N+NHHHH+N+N+Nr{   weights_onlymap_locationcpu)strict)!rw   pathjoinr9   r5   isfilerJ   r   r   r  openjsonloadr   setr   r   r   r   r   r   RuntimeErrorr'   safe_load_filer   rs   load_state_dictgccollectr   modulesr   _IncompatibleKeys)modelfolderr3  prefer_safe
index_filesafe_index_fileindex_presentsafe_index_present	filenames	load_safe
load_indexfr  shard_filesmissing_keysunexpected_keyserror_messagestr_missing_keysstr_unexpected_keysweights_only_kwargloaderr  r   r&  r(  s                          @@r|   load_sharded_checkpointrU    s   0 f&899Jgll6+BCCOGNN:..M88 c"4 c9Q9S9S c=U=W=Wr!899^p]r 	 a6;;y;Q;QaaX^aaabbbI 	 	'))  		llll     	I$-=:J	j#	0	0	0 A	!               s5.55778899K %**,,K!!##((**JHHHH:HHHLKKKKkKKKO *3|$$q((C,@,@1,D,DXeo>VXX|q  "xx(H(H<(H(H(HIIE2BEEEEM!##"%((+N+No+N+N+N"O"OH2EHHHHM=)))3V^.$//\^(k^^gej.k.ku.kXj.k.kF!  
VBGLL<<==
j777 

 8"44\?SSSs   <EE!$E!checkpoint_fileis_quantizedr1  r0  c           	      b   |                      d          rt                      rvt          | d          5 }|                                }ddd           n# 1 swxY w Y   |                    d          dvrt          d|  d          t          |           S 	 |qt                      r@t          j	        
                                r"t          j	                                        d	k    st                      rt                      s|sd
}nd}i }t          | t                    rM|d
k    rGt!          j        t          j                  t!          j        d          k    rt'          |           rddi}t(          rd|ini }t          j        | fd|i||S # t,          $ r}	 t/          |           5 }|                    d          dk    rt          d          t3          d|  d          |# 1 swxY w Y   n+# t4          t2          f$ r t          d|  d|  d          w xY wY d}~dS d}~ww xY w)z]
    Reads a PyTorch checkpoint file, returning properly formatted errors if they arise.
    r  pt	frameworkNformat)rY  tfflaxmlxz"The safetensors archive passed at zf does not contain the valid metadata. Make sure you save your model with the `save_pretrained` method.r   r  r2  2.1.0mmapTr0  r1     r   zYou seem to have cloned a repository without having git-lfs installed. Please install git-lfs and run `git lfs install` followed by `git lfs pull` in the folder you cloned.zUnable to locate the file z_ which is necessary to load this pretrained model. Make sure you have saved the model properly.z9Unable to load weights from pytorch checkpoint file for 'z' at 'zZ'. If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True.)endswithrJ   rl   r
  ry   OSErrorr<  r!   rs   rt   rv   get_rankr}   r   r  r   r   parser   r   r'   r9  	Exceptionr7  readr   UnicodeDecodeError)	rV  rW  r1  r0  rL  r
  
extra_argsrS  es	            r|   r=  r=     sd    // 	/4L4N4N 	/$777 	$1zz||H	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$<<!!)DDDI_ I I I   o...0 /00
% )88::
% )2244q88#%% 9.B.D.D 9 # 9  &$
 ,,	(&&e/00GM'4J4JJJ?++ K !$J?bjnl;;hjz
 
%
 !
 	
 
 	
    	o&& !66!99	))!&   %N_ N N N           #J/ 	 	 	jO j j&j j j  	     sZ   AAAC9F 
H. G8/=G,,G0	0G83G0	4G87H)8(H  H))H.c                     i }|                                  D ]\\  }fd|D             }dk    rt          |          }|                    |                                          rd|_        W||<   ]|S )z
    Sets the `_is_hf_initialized` flag in all submodules of a given model when all its weights are in the loaded state
    dict.
    c                 r    h | ]3}|                      d           |                     d d          4S r   r   )r   r  )r   r   module_names     r|   	<setcomp>z-set_initialized_submodules.<locals>.<setcomp>m  sN    tttATUT`T`doarararTsTstqyyK!2!2!2B77tttr{   r   T)named_modulesr:  
issupersetr   _is_hf_initialized)rB  state_dict_keysnot_initialized_submodulesr   r&  ro  s        @r|   set_initialized_submodulesrv  f  s    
 "$$2244 = =Vttttttt"o..K!!&"3"3"5"566 	=(,F%%6<&{33%%r{   tensorr   c                     |                                  rC|                     d          d                                         |                                 z   }n|                                 }|S )Nr   )nelementviewdata_ptrelement_size)rw  stops     r|   _end_ptrr~  x  s[     !{{2r"++--0C0C0E0EE  Kr{   r   c                    g }t          | dd           (fd| j        D             }|                    |           t          | dd           (fd| j        D             }|                    |           |                                 D ]4\  }}r d| n|}|                    t          ||                     5|S )N_tied_weights_keysc                 &    g | ]}r d | n|S r   rz   r   r   prefixs     r|   r   z)_get_tied_weight_keys.<locals>.<listcomp>  s,    UUUaF1FQUUUr{   _dynamic_tied_weights_keysc                 &    g | ]}r d | n|S r  rz   r  s     r|   r   z)_get_tied_weight_keys.<locals>.<listcomp>  s,    ]]]aF1FQ]]]r{   r   )r  )r   r  extendr  named_children_get_tied_weight_keys)r   r  tied_weight_keysnamesr   	submodulelocal_prefixs    `     r|   r  r    s    v+T22>UUUU6;TUUU&&&v3T::F]]]]6;\]]]&&&!0022 W Wi-3=&))4))) 5i U U UVVVVr{   tensorsc                    g }| D ]}t          |          dk     r|                    |           +g }|D ]A}||         }|                    |                                t          |          |f           B|                                 |d         \  }}}	|                    |	h           |dd          D ]@\  }
}}|
|k    r|                    |h           n|d                             |           |}Ag }g }|D ]R} t          |           dk    r(|                    |                                            =|                    |            S||fS )N   r   r   r   )r   r  r{  r~  sortaddpop)r  r   filtered_tensorssharedareasr   rw  _	last_stop	last_namestartr}  disjoint_tensorsshared_tensorss                 r|   _find_disjointr    s     v;;??##F+++ 	F 	FD%FLL&//++Xf-=-=tDEEEE

"'(9i,,,!&qrr 	 	E4	!! ''//// $((...II	 N# + +w<<1##GKKMM2222!!'****+++r{   c                    g }g }| D ]}t          |          dk     rt          j        t                    }|D ]N}||         }|j        |                                t          |          f}||                             |           Ot          |          dk    r|                    |           |                    |           ||fS )Nr  r   )	r   collectionsdefaultdictr:  r   r{  r~  r  r  )	r  r   r  	identicalr  r  r   rw  areas	            r|   _find_identicalr    s    NI * *v;;??',, 	" 	"D%FM6??#4#4hv6F6FGD$KOOD!!!!u::??V$$$$!!&))))9$$r{   c                     g }g }i }i }i }d| j         j         d}	|                                D ]v}
d }d|
v r|
                    dd          }|s|n|||
<   d|
v r|
                    dd          }|s|n|||
<   |r*|                    |
           |                    |           wi ||}|rJ|	dz  }	|                                D ]\  }}|	d| d	| d
z  }	|	dz  }	t                              |	           t          ||          D ]\  }}|	                    |          ||<   t          |dd           |                                }|_        g ddt          j        ffd | |||           ~S )NzA pretrained model of type `z` gammar  betabiasrcontains parameters that have been renamed internally (a few are listed below but more are present in the model):
* `` -> ``
nIf you are using a model from the Hub, consider submitting a PR to adjust these weights and help future users.	_metadatar   Fr   c                   
 i n                     d d         i           }||d<   ||dg g f}t          fd|D                       dk    rt                      rdd l}t	          |                     d d         d                    

fd|                                D             }t          |          dk    r`|j                            |d	          5  t          j
                                        dk    r
 | j        |  d d d            n# 1 swxY w Y   n
 | j        |  | j                                        D ]\  }}	|	 |	||z   d
z   |           d S )Nr   assign_to_params_buffersTc                 >    g | ]}|                               |S rz   r   r   r   r  s     r|   r   z=_load_state_dict_into_model.<locals>.load.<locals>.<listcomp>  s*    DDDS^^F-C-CDDDDr{   r   F)r  recursec                 (    g | ]}|v |         S rz   rz   )r   r   named_parameterss     r|   r   z=_load_state_dict_into_model.<locals>.load.<locals>.<listcomp>  s+    #l#l#lAVW[kVkVk$4Q$7VkVkVkr{   modifier_rankr   )ry   r   r!   	deepspeeddictr  r   zeroGatheredParametersrs   rt   re  _load_from_state_dict_modulesr   )r   r   r  r  local_metadatar   r  params_to_gatherr   childr  
error_msgsr9  r
  s     `       @r|   r9  z)_load_state_dict_into_model.<locals>.load  s   '/X\\&"+r5R5R5M12FND"b*M DDDDzDDDEEII)++ 4     $((?(?vcrc{\a(?(b(b#c#c #l#l#l#lARAR#l#l#l '((1,, #::;K[\:]] @ @ ,55771<<8F8$??@ @ @ @ @ @ @ @ @ @ @ @ @ @ @ -,d33!?0022 	W 	WKD% UJ(;=UVVV	W 	Ws    -DD D)r  r  )r   F)r   r   r   r  r  r   r   	info_oncezipr  r   copyr  r   r   )r   r   r   r  old_keysnew_keysrenamed_keysrenamed_gammarenamed_betawarning_msgr   new_keyold_keyr  r9  r
  s                @@@r|   _load_state_dict_into_modelr    s=   HHLMLU1H1QUUUK   % %c>>kk'844G0=!P=M#S==kk&&11G/; ML 	%OOC   OOG$$$4m4|4L &  M  	M , 2 2 4 4 	= 	=GW<<<<<<<KK  H  	H%%%(33 6 6(nnW55
7 z;55H""J'
JW WRY W W W W W W W W: 	D
<Rjkkkk 	r{   c                    t          |          dk    rE|                    |          r0d                    |                    d          dd                   }|                    d          }| }t          |          dk    rFt	          ||d                   rt          ||d                   }|d= nd}nt          |          dk    F|| k    rd}||d         fS )z
    A helper util to find the last sub-module and the param/buffer name. If `start_prefix` is supplied it'll be removed
    from the start of the key
    r   r   r   N)r   r   r5  splithasattrr   )rB  long_keyr   	split_keyr  s        r|   find_submodule_and_param_namer    s     <1!4!4\!B!B88HNN3//344s##II
i..1

9il++ 		9Q<88I!I i..1

 E	il""r{   c                 L   |D ]}t          | ||          \  }}|t          ||          }t          |t          j        j                  r3t          j                            |                    d                    }n|                    d          }t          |||           dS )z
    Moves `loaded_state_dict_keys` in model to meta device which frees up the memory taken by those params.

    `start_prefix` is used for models which insert their name into model keys, e.g. `bert` in
    `bert.pooler.dense.weight`

    Nr  )r  r   r  rs   r   	Parametertor   )rB  loaded_state_dict_keysr   r   r  
param_namenew_vals          r|   _move_model_to_metar  &  s     $ 4 4 =eQ U U	:  i44G'58#566 -(,,WZZ-?-?@@!**V,,Iz73334 4r{   c           	      
  ' g }g }g }i }i }|
du}dt          |            }|                                D ]}d}d|v r|                    dd          }|s|n|||<   d|v r|                    dd          }|s|n|||<   t          t          j        j        d          r5d|v r|                    dd	          }d
|v r|                    d
d          }n4d	|v r|                    d	d          }d|v r|                    dd
          }|r*|                    |           |                    |           i ||}|rJ|dz  }|                                D ]\  }}|d| d| dz  }|dz  }t          
                    |           t          ||          D ]\  }}|                    |          ||<   t          t          d          }|                                D ]\  '}'|vr'                    |          r't          |          d         ''}i }|o|j        t          j        k    }|	t          j        |          r|s|t'          'fd|D                       rl|	t          j        k    r\|                    t          j                  }dt/          t1          j        t4                    j                  v rt          j        |d<   n|                    |	          }| }'                    d          } | D ]C}!t;          ||!          }t=          |t          j        j        t          j         f          sd}| nD|D|	|                    |j                  }|!                                r|"                                }||d<   |d}"nt          |          dk    rK||vrGd#                    |                    d          dd                   }t          |          dk    r||vG|dk    rd|vrtI          ' d          ||         }"|"dk    r|stK          |'||          }S|"dk    r|tK          |'||          }o|r"|
j&        r|
'                    | |'||"|          s1tQ                      rtS                      rdnd}"t5          | '|"fi | |
*                    | |'|"||           tQ                      stW                      rtY          | '          \  }#}$t;          |#|$          }%d}&tQ                      rtS                      sd}& t          |%          |%j-                            |&          fi |%j.        }%t_          |#|$|%           |||fS )a  
    This is somewhat similar to `_load_state_dict_into_model`, but deals with a model that has some or all of its
    params on a `meta` device. It replaces the model params with the data from the `state_dict`, while moving the
    params back to the normal device, but only for `loaded_state_dict_keys`.

    `start_prefix` is used for models which insert their name into model keys, e.g. `bert` in
    `bert.pooler.dense.weight`

    NzThis model r  r  r  r  weight_normweight_g!parametrizations.weight.original0weight_v!parametrizations.weight.original1r  r  r  r  r  float8_e4m3fnc              3   F   K   | ]}|                     d           v V  dS r   Nr  )r   module_to_keep_in_fp32r  s     r|   	<genexpr>z3_load_state_dict_into_meta_model.<locals>.<genexpr>  sH        H^*j.>.>s.C.CC     r{   r   r   valuer2  r   r   r   z doesn't have any device set.disk)param_device
device_mapr  )0typer   r  r  r   utilsparametrizationsr  r   r   r  r  r  rs   r   r   r   r  r   anyfloat16r  r   r   inspect	signaturerh   r   r  r   r  r  r   is_contiguous
contiguousr5  r   rf    requires_parameters_quantizationcheck_quantized_paramr}   r   create_quantized_paramr!   r-   datar   r   )(rB  r   r   expected_keysr  offload_folderoffload_indexstate_dict_folderstate_dict_indexr   hf_quantizeris_safetensorskeep_in_fp32_modulesrO  pretrained_model_name_or_pathr  r  r  r  r  rW  r  r   r  r  r  is_torch_e4m3fn_availableparamro  set_module_kwargsis_param_float8_e4m3fn	old_paramsplitsr  r  r   tensor_namer  param_tor  s(                                          @r|    _load_state_dict_into_meta_modelr  @  s   @ JHHMLt+L-U--K   % %c>>kk'844G0=!P=M#S==kk&&11G/; ML 28,m<< 		WS  ++j2UVVS  ++j2UVV2c99++&I:VV2c99++&I:VV 	%OOC   OOG$$$4m4|4L &  M  	M , 2 2 4 4 	= 	=GW<<<<<<<KK  H  	H%%%(33 6 6(nnW55
7 ' ? ?'--// `4 `4
E]**  .. 	9#C$5$5$7$78J  ";!au{eNa?a!8!?!?H^$0    bv     1 U]**// d7#45P#Q#Q#\]]]]16%g. 	!!#&& 	 	E	511Ii%(*<el)KLL ! 	  !  }11&&(( +((**%*'" LL k""Q&&;j+H+H!hh{'8'8'='=crc'BCC k""Q&&;j+H+Hb  Rz%9%9 J!M!M!MNNN%k2L6!!! a .uj.R_ ` `U""'7'C-eZARTdee	4 A	4 !665*j|`j 7  		4    K(<(>(>JuuF (z<]]K\]]]]//uj,Xbdsttt    4$>$@$@ 4&:5*&M&M#44 "$$ &-A-C-C &%H#UEJMM($;$;NNu~NNU333 }&666r{   variantc                     |C|                      d          }|d d         |gz   |dd          z   }d                    |          } | S )Nr   r   )r  r5  )r  r	  r  s      r|   _add_variantr    sS    ##C((y(6"##;6xx''r{   c                      e Zd ZdZed             Zed             Zd Zd Ze	de
j        fd            Ze	de
j        fd            Zd	edefd
Zedd            Z	 d dedee         de
j        de
j        def
dZ	 d!dee         dededefdZd Zd"dededefdZdeeee
j        ef         f         defdZ	 d#deeee
j        ef         f         dedefdZdS )$r   zH
    A few utilities for `torch.nn.Modules`, to be used as a mixin.
    c                     	 dd l }n# t          $ r t          d          w xY w|                    t          j                              }|                                }|j        | _        d S )Nr   FYou need to install psutil (pip install psutil) to use memory tracing.)psutilImportErrorProcessrw   getpidmemory_inforssmem_rss_pre_forward)r   r   r   r  processmems         r|   _hook_rss_memory_pre_forwardz-ModuleUtilsMixin._hook_rss_memory_pre_forward  sz    	hMMMM 	h 	h 	hfggg	h ..--!!##%(W"t    !c                 8   	 dd l }n# t          $ r t          d          w xY w|                    t          j                              }|                                }|j        | _        | j        | j        z
  }|t          | d          r| j
        ndz   | _
        d S )Nr   r  mem_rss_diff)r  r  r  rw   r  r  r  mem_rss_post_forwardr  r  r  )r   r   r   r  r  r  r  s          r|   _hook_rss_memory_post_forwardz.ModuleUtilsMixin._hook_rss_memory_post_forward  s    	hMMMM 	h 	h 	hfggg	h ..--!!##&)g#2V5OO*WVUcEdEd.kf.A.Ajkltr  c                     |                                  D ]6}|                    | j                   |                    | j                   7|                                  dS )a%  
        Add a memory hook before and after each sub-module forward pass to record increase in memory consumption.

        Increase in memory consumption is stored in a `mem_rss_diff` attribute for each module and can be reset to zero
        with `model.reset_memory_hooks_state()`.
        N)r@  register_forward_pre_hookr  register_forward_hookr  reset_memory_hooks_stateselfr   s     r|   add_memory_hooksz!ModuleUtilsMixin.add_memory_hooks  sf     llnn 	M 	MF,,T-NOOO(()KLLLL%%'''''r{   c                 ^    |                                  D ]}d|_        d|_        d|_        dS )z
        Reset the `mem_rss_diff` attribute of each module (see [`~modeling_utils.ModuleUtilsMixin.add_memory_hooks`]).
        r   N)r@  r  r  r  r"  s     r|   r!  z)ModuleUtilsMixin.reset_memory_hooks_state'  s@     llnn 	+ 	+F"#F*+F')*F&&	+ 	+r{   r   c                      t          |           S )z
        `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
        device).
        )r   r#  s    r|   r   zModuleUtilsMixin.device0  s     $D)))r{   c                      t          |           S )zw
        `torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
        )r   r'  s    r|   r   zModuleUtilsMixin.dtype8  s    
 #4(((r{   encoder_attention_maskc                 (   |                                 dk    r|dddddddf         }|                                 dk    r|ddddddf         }|                    | j                  }d|z
  t          j        | j                  j        z  }|S )z
        Invert an attention mask (e.g., switches 0. and 1.).

        Args:
            encoder_attention_mask (`torch.Tensor`): An attention mask.

        Returns:
            `torch.Tensor`: The inverted attention mask.
           Nr  r         ?)dimr  r   rs   finfomin)r#  r)  encoder_extended_attention_masks      r|   invert_attention_maskz&ModuleUtilsMixin.invert_attention_mask?  s     "%%''1,,.DQQQaaaQRQRQR].S+!%%''1,,.DQQQdTUTUTUEU.V+ +J*L*LSWS]*L*^*^'+.1P+PTYT_`d`jTkTkTo*o'..r{   Nc                 8   |t          j        dt                     n|j        }| \  }}t	          j        ||          }|d d d d f                             ||d          |d d d d f         k    }|                    |j                  }|j	        d         |j	        d         k     rP|j	        d         |j	        d         z
  }t	          j
        t	          j        |||f||j                  |gd          }|d d d d d d d f         |d d d d d d f         z  }|S )NNThe `device` argument is deprecated and will be removed in v5 of Transformers.)r   r   r   r   r   axis)warningswarnFutureWarningr   rs   arangerepeatr  r   shapecatones)	input_shapeattention_maskr   
batch_size
seq_lengthseq_idscausal_maskprefix_seq_lenextended_attention_masks	            r|   *create_extended_attention_mask_for_decoderz;ModuleUtilsMixin.create_extended_attention_mask_for_decoderW  se   M`bo    $*F!,
J,z&999dD!!!m,33J
ANNRYZ^`a`a`acgZgRhh "nn^%9::Q."6q"999+1!4{7H7KKN)J
JGPV^i^oppp   K #.aaaqqq!!!m"<~aaaQUW[]^]^]^N^?_"_&&r{   rA  r@  r   r   c                 2   || j         }|                                dk    r| j        j        s|t	          j        dt                     |                                dk    r|dddddddf         }nm|                                dk    r:| j        j        rt                              |||          }n,|ddddddf         }nt          d| d|j
         d          |                    |          }d	|z
  t          j        |          j        z  }|S )
a  
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (`Tuple[int]`):
                The shape of the input to the model.

        Returns:
            `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
        Nr  r4  r+  z!Wrong shape for input_ids (shape z) or attention_mask (shape )r,  r-  )r   r.  config
is_decoderr8  r9  r:  r   rH  r   r=  r  rs   r/  r0  )r#  rA  r@  r   r   rG  s         r|   get_extended_attention_maskz,ModuleUtilsMixin.get_extended_attention_masks  s^    =JE""$$))dk.D)!dfs  
 1$$&4QQQaaa]&C##!!Q&& {% K*:*e*e+ +'' +9D$9I*J''sKss\j\psss   #:"<"<5"<"I"I#&)@#@EKPUDVDVDZ"Z&&r{   F	head_masknum_hidden_layersis_attention_chunkedc                 v    |0|                      ||          }|du r|                    d          }ndg|z  }|S )a  
        Prepare the head mask if needed.

        Args:
            head_mask (`torch.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*):
                The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard).
            num_hidden_layers (`int`):
                The number of hidden layers in the model.
            is_attention_chunked (`bool`, *optional*, defaults to `False`):
                Whether or not the attentions scores are computed by chunks or not.

        Returns:
            `torch.Tensor` with shape `[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or list with
            `[None]` for each layer.
        NTr   )_convert_head_mask_to_5d	unsqueeze)r#  rN  rO  rP  s       r|   get_head_maskzModuleUtilsMixin.get_head_mask  sR    $  55iARSSI#t++%//33	!22Ir{   c                 P   |                                 dk    rh|                    d                              d                              d                              d          }|                    |dddd          }nS|                                 dk    r;|                    d                              d                              d          }|                                 dk    sJ d|                                              |                    | j                  }|S )zD-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]r   r   r   r     zhead_mask.dim != 5, instead r,  )r.  rS  expandr  r   )r#  rN  rO  s      r|   rR  z)ModuleUtilsMixin._convert_head_mask_to_5d  s    ==??a!++A..88;;EEbIISSTVWWI!(():BBKKII]]__!!!++A..88<<FFrJJI}}!###%UIMMOO%U%U###LLtzL22	r{   only_trainableexclude_embeddingsc                   	 |r?d |                                  D             		fd|                                 D             }n!t          |                                           }g }t	          | dd          }|r"t                      rddl}nt          d          |D ]}|j        s|s|rt          ||j
        j                  rrt          |d          r|                                }nt          |d	          r|j        j        }nd
}|                    |                                dz  |z             |                    |                                           t%          |          S )a  
        Get number of (optionally, trainable or non-embeddings) parameters in the module.

        Args:
            only_trainable (`bool`, *optional*, defaults to `False`):
                Whether or not to return only the number of trainable parameters

            exclude_embeddings (`bool`, *optional*, defaults to `False`):
                Whether or not to return only the number of non-embeddings parameters

        Returns:
            `int`: The number of parameters.
        c                 R    g | ]$\  }}t          |t          j                  | d %S ).weight)r  r   	Embedding)r   r   module_types      r|   r   z3ModuleUtilsMixin.num_parameters.<locals>.<listcomp>  sJ     % % %%6T;R\]hjljvRwRw%   % % %r{   c                 "    g | ]\  }}|v	|S rz   rz   )r   r   r   embedding_param_namess      r|   r   z3ModuleUtilsMixin.num_parameters.<locals>.<listcomp>  s/          -dIDXmLmLm	LmLmLmr{   is_loaded_in_4bitFr   Nzbitsandbytes is not installed but it seems that the model has been loaded in 4bit precision, something went wrong make sure to install bitsandbytes with `pip install bitsandbytes`. You also need a GPU. r|  quant_storager   r  )rq  r  r   r   r   rD   bitsandbytesr   requires_gradr  r   
Params4bitr  r|  rb  itemsizer  r  sum)
r#  rX  rY  total_parameterstotal_numelra  bnbr  	num_bytesr`  s
            @r|   num_parameterszModuleUtilsMixin.num_parameters  s     	7% %:>:L:L:N:N% % %!       151F1F1H1H       $DOO$5$566#D*=uEE 	(** ***** p  
 & 	6 	6E" 6. 6 % 	6E36;L)M)M 	6un55 &$)$6$6$8$8		 88 &$)$7$@		$%	&&u{{}}q'89'DEEEE&&u{{}}555;r{   
input_dictc                     t          | d          si | _        | j        |v r|| j                                                 S d| j        vr$t                              d           d| j        d<   dS )z
        Helper function to estimate the total number of tokens from the model inputs.

        Args:
            inputs (`dict`): The model inputs.

        Returns:
            `int`: The total number of tokens.
        warnings_issuedestimate_tokenszdCould not estimate the number of tokens of the input, floating-point operations will not be computedTr   )r  ro  main_input_namer  r   r  )r#  rm  s     r|   rp  z ModuleUtilsMixin.estimate_tokens  s     t.// 	&#%D :--d2399;;;d&:::NNv   7;D !23qr{   Tc                 `    d|                      |          z  |                     |          z  S )a  
        Get number of (optionally, non-embeddings) floating-point operations for the forward and backward passes of a
        batch with this transformer model. Default approximation neglects the quadratic dependency on the number of
        tokens (valid if `12 * d_model << sequence_length`) as laid out in [this
        paper](https://arxiv.org/pdf/2001.08361.pdf) section 2.1. Should be overridden for transformers with parameter
        re-use e.g. Albert or Universal Transformers, or if doing long-range modeling with very high sequence lengths.

        Args:
            batch_size (`int`):
                The batch size for the forward pass.

            sequence_length (`int`):
                The number of tokens in each line of the batch.

            exclude_embeddings (`bool`, *optional*, defaults to `True`):
                Whether or not to count embedding and softmax operations.

        Returns:
            `int`: The number of floating-point operations.
           )rY  )rp  rl  )r#  rm  rY  s      r|   floating_point_opsz#ModuleUtilsMixin.floating_point_ops  s4    0 4''
333d6I6I]o6I6p6pppr{   r   )NNFFFT)r   
__module____qualname____doc__staticmethodr  r  r$  r!  propertyrs   r   r   r   r2  rH  r   r   r   rM  r   r   rT  rR  rl  r   r   r   r	   rp  rt  rz   r{   r|   r   r     sm         	 	 \	   \
( 
( 
(+ + + * * * * X* )u{ ) ) ) X)/F /v / / / /0 ' ' ' \'8 rv2' 2'$2'38:2'GL|2'chcn2'	2' 2' 2' 2'j af !&)>AY]	   6	 	 	4  4 T 4 t 4 `c 4  4  4  4 l$sE%,:K4L/L*M RU    , [_q qsE%,*;$<<=qSWq	q q q q q qr{   c                   H    e Zd ZdZdZdZdZdZdZdZ	dZ
dZdZdZdZdZdZdZdZdZdZdZdZdZedeeej        f         fd            Zedefd            Zd	ef fd
Z d Z!d Z"d Z#de$e%e         ef         ddfdZ&e'd             Z(e'	 	 	 	 dode)de*ej+                 de*e$eeee,f         f                  de)fd            Z-e'dej+        dej+        fd            Z.ede/j0        fd            Z1e'de)fd            Z2e'	 	 	 	 dpde*ej+                 de*e$eeee,f         f                  de)de)def
d            Z3e'dqde)defd            Z4d Z5d Z6de/j0        fd Z7d!e/j0        fd"Z8de/j0        fd#Z9d$ Z:d% Z;d& Z<e=d'e/j0        d(e/j0        d)ed*efd+            Z>d, Z?defd-Z@	 	 	 drd.e*e,         d/e*e,         d0e)de/jA        fd1ZBdsd2ZC	 	 	 drd3e/jA        d.e*e,         d/e*e,         d0e)de/jA        f
d4ZD	 	 	 dtd5e/jE        d.e*e,         d6e*e)         d0e)de/jE        f
d7ZFd8 ZG	 dqd9ZHd: ZId; ZJd<e,fd=ZKde$e/jA        eLe/jA                 f         fd>ZMd? ZNd@ee,e%e,         f         fdAZOdudBZPdeQfdCe)dDeRfdEZSdF ZTede)fdG            ZUddejV        ddHddddf	dIe$eeWjX        f         dJe)dKe*eY         dLeRdMe)dNe$e,ef         dOe)dPe*e         dQe*e$ee)f                  dRe)fdSZZ e[e\j]                   fdT            Z]dvdUZ^ e[ej/        j0        j_                   fdV            Z_ e[ej/        j0        j`                   fdW            Z` fdXZa fdYZbe'dddddddZddd[	d\e*e$eeWjX        f                  d	e*e$eeeWjX        f                  d]e*e$eeWjX        f                  d^e)d_e)d`e)dQe*e$ee)f                  daedbe)dce)dd fdd            Zce'	 	 	 	 	 	 	 	 	 	 	 	 dwde            ZddxdfZee=	 	 	 	 dydg            Zfe'dzdi            Zgd{djZhdk Zidl Zjedm             Zkeeldn                         Zm xZnS )|PreTrainedModela  
    Base class for all models.

    [`PreTrainedModel`] takes care of storing the configuration of the models and handles methods for loading,
    downloading and saving models as well as a few methods common to all models to:

        - resize the input embeddings,
        - prune heads in the self-attention heads.

    Class attributes (overridden by derived classes):

        - **config_class** ([`PretrainedConfig`]) -- A subclass of [`PretrainedConfig`] to use as configuration class
          for this model architecture.
        - **load_tf_weights** (`Callable`) -- A python *method* for loading a TensorFlow checkpoint in a PyTorch model,
          taking as arguments:

            - **model** ([`PreTrainedModel`]) -- An instance of the model on which to load the TensorFlow checkpoint.
            - **config** ([`PreTrainedConfig`]) -- An instance of the configuration associated to the model.
            - **path** (`str`) -- A path to the TensorFlow checkpoint.

        - **base_model_prefix** (`str`) -- A string indicating the attribute associated to the base model in derived
          classes of the same architecture adding modules on top of the base model.
        - **is_parallelizable** (`bool`) -- A flag indicating whether this model supports model parallelization.
        - **main_input_name** (`str`) -- The name of the principal input to the model (often `input_ids` for NLP
          models, `pixel_values` for vision models and `input_values` for speech models).
    Nr   	input_idsFr   c                 8    dt          j        t                    iS )z^
        `Dict[str, torch.Tensor]`: Dummy inputs to do a forward pass in the network.
        r  )rs   rw  r3   r'  s    r|   dummy_inputszPreTrainedModel.dummy_inputsx  s    
 U\,7788r{   c                     dS )z@
        :str: Identifies that this is a PyTorch model.
        rY  rz   r'  s    r|   r[  zPreTrainedModel.framework  s	    
 tr{   rK  c                    t                                                       t          |t                    s*t	          d| j        j         d| j        j         d          t          |dd          s)|                     |t          j
                    d          }|| _        |j        | _        i | _        |                                 rt          j        |          nd | _        t%          j        | j        j                  | _        d S )NzParameter config in `zt(config)` should be an instance of class `PretrainedConfig`. To create a model from a pretrained model use `model = z(.from_pretrained(PRETRAINED_MODEL_NAME)`_attn_implementation_autosetF)torch_dtypecheck_device_map)super__init__r  r   r   r   r   r   _autoset_attn_implementationrs   get_default_dtyperK  name_or_pathro  can_generater   from_model_configgeneration_configr  _keep_in_fp32_modules)r#  rK  inputsr   r   s       r|   r  zPreTrainedModel.__init__  s   &"233 	^(? ^ ^ N3^ ^ ^   v=uEE 	66E$;$=$=PU 7  F "/!OSO`O`ObOb!l!1!CF!K!K!Khl &*Yt~/S%T%T"""r{   c                 V    |                                   |                                  dS )z
        A method executed at the end of each Transformer model initialization, to execute code that needs the model's
        modules properly initialized (such as weight initialization).
        N)init_weights._backward_compatibility_gradient_checkpointingr'  s    r|   	post_initzPreTrainedModel.post_init  s.    
 	;;=====r{   c                 p    t          | dd          }|t          d          |                    |           S )z
        Potentially dequantize the model in case it has been quantized by a quantization method that support
        dequantization.
        r  Nz?You need to first quantize your model in order to dequantize it)r   r   
dequantize)r#  r  s     r|   r  zPreTrainedModel.dequantize  s>    
 t^T::^___&&t,,,r{   c                     | j         rAt          | j        dd          r-|                                  t	          | j        d           d S d S d S )Ngradient_checkpointingF)supports_gradient_checkpointingr   rK  gradient_checkpointing_enabledelattrr'  s    r|   r  z>PreTrainedModel._backward_compatibility_gradient_checkpointing  sc    / 	;GDKIach4i4i 	;..000DK!9:::::	; 	; 	; 	;r{   tagsc                     t          |t                    r|g}| j        g | _        |D ]%}|| j        vr| j                            |           &dS )a\  
        Add custom tags into the model that gets pushed to the Hugging Face Hub. Will
        not overwrite existing tags in the model.

        Args:
            tags (`Union[List[str], str]`):
                The desired tags to inject in the model

        Examples:

        ```python
        from transformers import AutoModel

        model = AutoModel.from_pretrained("google-bert/bert-base-cased")

        model.add_model_tags(["custom", "custom-bert"])

        # Push the model to your namespace with the name "my-custom-bert".
        model.push_to_hub("my-custom-bert")
        ```
        N)r  r   
model_tagsr  )r#  r  tags      r|   add_model_tagszPreTrainedModel.add_model_tags  si    , dC   	6D?" DO 	, 	,C$/))&&s+++	, 	,r{   c                    |                     dt          j                              }|                     dd          }d}||                     |          }t	          j        |          }|j        |j        }nd}|                     d|          |_        t          |dd          s| 	                    ||d|          }t                      rgddl}t                              d	           |j                            t!                      
          5   | |fi |}ddd           n# 1 swxY w Y   n	 | |fi |}|t          j        |           |S )z
        All context managers that the model should be initialized under go here.

        Args:
            torch_dtype (`torch.dtype`, *optional*):
                Override the default `torch.dtype` and load the model under this dtype.
        r  use_flash_attention_2FNattn_implementationr  )r  r  r  r   @Detected DeepSpeed ZeRO-3: activating zero.init() for this modelconfig_dict_or_path)r  rs   r  _set_default_torch_dtyper  deepcopy_attn_implementation_internal_attn_implementationr   r  r!   r  r   infor  Initr    set_default_dtype)	clsrK  r   r  r  
dtype_origr  r  rB  s	            r|   _from_configzPreTrainedModel._from_config  s    jj0G0I0IJJ &

+BE J J 
"55kBBJv&&/; #)"F"&&,jj1FH[&\&\#v=uEE 	55&;!&'	 6  F &'' 
	*KKZ[[[ $$9I9K9K$LL . .F--f--. . . . . . . . . . . . . . . C))&))E !#J///s   
D**D.1D.Tr  r  r  r  c                 T   d}t          |d          r|j        |j        dk    r|rt          d|j         d          t	          |j        t
                    s>|j        dvr5d|j         d}| j        r|d	z  }| j        r|d
z  }t          |dz             |j        }|D ]i}t	          t          ||          t                    rDt          ||          }	t	          |t
                    s|n|
                    |d          }
|
|	_        j|r!t                              d           d|_        |j        dk    r|                     |||d|           n|dv rt                      s|                     ||dnd          }t           j        j        k|j        dk    r`t           j                                        dk    r>t                              d           t           j        j                            d           n$t	          |t
                    rd|_        nd|_        d|_        |S )az  
        Automatically checks and dispatches to a default attention implementation. In order of priority:
            1. An implementation specified in `config._attn_implementation` (due for example to the argument attn_implementation="sdpa" in from_pretrained).
            2. DEPRECATED: if use_flash_attention_2 is set to `True` and `flash_attn` is available, flash attention. (`LlamaFlashAttention` for example)
            3. SDPA implementation, if available and supported by the model type. (`LlamaSdpaAttention` for example)
            4. The default model's implementation otherwise (`LlamaAttention` for example) .
        Nr  flash_attention_2zBoth attn_implementation="z" and `use_flash_attention_2=True` were used when loading the model, which are not compatible. We recommend to just use `attn_implementation="flash_attention_2"` when loading the model.)eagersdpar  z Specified `attn_implementation="zt"` is not supported. The only possible arguments are `attn_implementation="eager"` (manual attention implementation)zT, `"attn_implementation=flash_attention_2"` (implementation using flash attention 2)zf, `"attn_implementation=sdpa"` (implementation using torch.nn.functional.scaled_dot_product_attention)r   zThe model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.F)r  r  hard_check_onlyr  )Nr  T)r  r  r   zUsing the `SDPA` attention implementation on multi-gpu setup with ROCM may lead to performance issues due to the FA backend. Disabling it to use alternative backends.r  )r  r  r  r   r  r  _supports_flash_attn_2_supports_sdpar   r   ry   r   warning_once_check_and_enable_flash_attn_2rL   _check_and_enable_sdpars   r   hipcudadevice_countbackendsenable_flash_sdpr  )r  rK  r  r  r  r  requested_attn_implementationmessager   
sub_configcurr_attn_implementations              r|   r  z,PreTrainedModel._autoset_attn_implementation  s   & )-%6:;; 	Q@d@p*.AAAF[A r1L r r r  
 f94@@ 
0VE` i F F
 OV=X  O  O  O- vuuG% H   H  HG 3/// -3,P)  	T 	TC'&#..0@AA T$VS11
 &&CTJJF116::3EE )
 <T
8  	> B   +>F'&*===..'% %!1 /     +n<<E[E]E]<//)F)NTX 0  F !-/699J++--11## }   #44U;;;5t<< 	2*.F''*1F'.2+r{   r   c                     |j         st          d| j         d| d          t                              d| j         d| d           t          j                    }t          j        |           |S )a  
        Change the default dtype and return the previous one. This is needed when wanting to instantiate the model
        under specific dtype.

        Args:
            dtype (`torch.dtype`):
                a floating dtype to set to.

        Returns:
            `torch.dtype`: the original `dtype` that can be used to restore `torch.set_default_dtype(dtype)` if it was
            modified. If it wasn't, returns `None`.

        Note `set_default_dtype` currently only works with floating-point types and asserts if for example,
        `torch.int64` is passed. So if a non-float `dtype` is passed this functions will throw an exception.
        zCan't instantiate z model under dtype=z' since it is not a floating point dtypezInstantiating z model under default dtype r   )r   r   r   r   r  rs   r  r  )r  r   r  s      r|   r  z(PreTrainedModel._set_default_torch_dtypeo  s    " & 	tS\ttettt   	VS\VVeVVVWWW,..
&&&r{   c                 .    t          | | j        |           S )z@
        `torch.nn.Module`: The main body of the model.
        )r   base_model_prefixr'  s    r|   
base_modelzPreTrainedModel.base_model  s    
 tT3T:::r{   c                    dt          | j                  v rdS t          | j                  t          | j                  v rdS | j        D ];}t	          |d          sdt          |          vr|                                r dS <dt          | j                  vr$t                              | j         d           dS dS )z
        Returns whether this model can generate sequences with `.generate()`.

        Returns:
            `bool`: Whether this model can generate sequences with `.generate()`.
        r   Tr  r~  u:   has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you're using `trust_remote_code=True`, you can get rid of this warning by loading the model with an auto class. See https://huggingface.co/docs/transformers/en/model_doc/auto#auto-classes
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.F)	r   	__bases__r   generater  r  prepare_inputs_for_generationr   r  )r  bases     r|   r  zPreTrainedModel.can_generate  s     CM 2 2224s|CL 1 1114M 	 	D400  D		11d6G6G6I6I1tt C(I$J$JJJ< 	  	  	    4ur{   r  c                    | j         st          | j         d|j         d          t	                      s@d}d}t
          j                            d          t          | d|           t          j
        t
          j        	                    d                    }t          j	        j        rs|t          j
        d          k     rt          | d	| d
|           t          j                                        st          | d          t          | d|           t          j	        j        rC|t          j
        d          k     rt          | d| d
|           t          | d|           t!          | dd          }	|	rt          d          |t"                              d           nB|@|t          j        t          j        fvr&t"                              d| j         d| d           |rl|jt          j        d          j        j        dk    rHt          j                                        rt"                              d           nct          d          |rR|Pt1          |t2                    r;d|                                v sd|                                v rt          d          |sd|_        |S )a9  
        Checks the availability of Flash Attention 2 and compatibility with the current model.

        If all checks pass and `hard_check_only` is False, the method will set the config attribute `attn_implementation` to "flash_attention_2" so that the model can initialize the correct attention module.
        z does not support Flash Attention 2.0 yet. Please request to add support where the model is hosted, on its model hub page: https://huggingface.co/zk/discussions/new or in the Transformers GitHub repo: https://github.com/huggingface/transformers/issues/newzVFlashAttention2 has been toggled on, but it cannot be used due to the following error:zPlease refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
flash_attnNz3 the package flash_attn seems to be not installed. r`  zY you need flash_attn package version to be greater or equal than 2.1.0. Detected version z. z\ Flash Attention 2 is not available on CPU. Please make sure torch can access a CUDA device.z% Flash Attention 2 is not available. z2.0.4z you need flash_attn package version to be greater or equal than 2.0.4. Make sure to have that version installed - detected version use_bettertransformerFzFlash Attention 2 and BetterTransformer API are not compatible. Please make sure to disable BetterTransformers by doing model.reverse_bettertransformer()zwYou are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviourzcFlash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in z is aG  . You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`r   r  zYou are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.a-  You are attempting to use Flash Attention 2.0 with a model not initialized on GPU and with no GPU available. This is not supported yet. Please make sure to have access to a GPU and either initialise the model on a GPU by passing a device_map or initialising the model on CPU and then moving it to GPU.r2  r  zYou are attempting to use Flash Attention 2.0 with a model dispatched on CPU or disk. This is not supported. Please make sure to initialise the model on a GPU by passing a device_map that contains only GPU devices as keys.r  )r  r   r   _name_or_pathrE   	importlibutil	find_specr  r   rf  r
  rs   r  ru   r  r   r   r  r  r   emptyr   r  r  r  r   r  )
r  rK  r  r  r  r  prefaceinstall_messageflash_attention_version_is_bettertransformers
             r|   r  z.PreTrainedModel._check_and_enable_flash_attn_2  s    ) 	< n nW]Wkn n n   )** 	jnG pO~''55=!W"r"rap"r"rsss&-mI4F4N4N|4\4\&]&]#}! j*W]7-C-CCC%"  j  j  ~U  j  j  Yh  j  j   0022 j$"  A  A  A   &&h&hWf&h&hiii" j*W]7-C-CCC%"  U  U  i@  U  U  DS  U  U   &&h&hWf&h&hiii '-De L L  	 l    J    $U]EN<[)[)[N(+N N:EN N N    	
 2u{1~~7L7QU[7[7[z&&(( 
##M   
 !R   		&:t,, '*++----:;L;L;N;N1N1Np    	>*=F'r{   c                     |r;| j         st          | j         d          t                      st	          d          t                      r| j         s|S t          | dd          }|r|S |sd|_        |S )a  
        Checks the availability of SDPA for a given model.

        If all checks pass and `hard_check_only` is False, the method will set the config attribute `_attn_implementation` to "flash_attention_2" so that the model can initialize the correct attention module.
        a   does not support an attention implementation through torch.nn.functional.scaled_dot_product_attention yet. Please request the support for this architecture: https://github.com/huggingface/transformers/issues/28005. If you believe this error is a bug, please open an issue in Transformers GitHub repository and load your model with the argument `attn_implementation="eager"` meanwhile. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="eager")`zSPyTorch SDPA requirements in Transformers are not met. Please install torch>=2.1.1.r  Fr  )r  r   r   rK   r  r   r  )r  rK  r  r  s       r|   r  z&PreTrainedModel._check_and_enable_sdpa  s      
	%  | S S S  
 +,, !i   '(( 	0B 	M '-De L L  	M 	1*0F'r{   c                 d    d }|                                                      |          | _        dS )z
        Enables the gradients for the input embeddings. This is useful for fine-tuning adapter weights while keeping
        the model weights fixed.
        c                 0    |                     d           d S NT)requires_grad_)r   inputoutputs      r|   make_inputs_require_gradszMPreTrainedModel.enable_input_require_grads.<locals>.make_inputs_require_grads:  s    !!$'''''r{   N)get_input_embeddingsr   _require_grads_hook)r#  r  s     r|   enable_input_require_gradsz*PreTrainedModel.enable_input_require_grads4  s<    	( 	( 	( $(#<#<#>#>#T#TUn#o#o   r{   c                 8    | j                                          dS )z4
        Removes the `_require_grads_hook`.
        N)r  remover'  s    r|   disable_input_require_gradsz+PreTrainedModel.disable_input_require_grads?  s     	 '')))))r{   c                 l    t          | | j        |           }|| ur|                                S t          )z
        Returns the model's input embeddings.

        Returns:
            `nn.Module`: A torch module mapping vocabulary to hidden states.
        )r   r  r  NotImplementedError)r#  r  s     r|   r  z$PreTrainedModel.get_input_embeddingsE  s;     T4#94@@
T!!22444%%r{   r  c                 r    t          | | j        |           }|| ur|                    |           dS t          )z
        Set model's input embeddings.

        Args:
            value (`nn.Module`): A module mapping vocabulary to hidden states.
        N)r   r  set_input_embeddingsr  )r#  r  r  s      r|   r  z$PreTrainedModel.set_input_embeddingsR  sC     T4#94@@
T!!++E22222%%r{   c                     dS )z
        Returns the model's output embeddings.

        Returns:
            `nn.Module`: A torch module mapping hidden states to vocabulary.
        Nrz   r'  s    r|   get_output_embeddingsz%PreTrainedModel.get_output_embeddings_  s	     tr{   c                     dS )a]  
        Initialize the weights. This method should be overridden by derived class and is
        the only initialization method that will be called when loading a checkpoint
        using `from_pretrained`. Any attempt to initialize outside of this function
        will be useless as the torch.nn.init function are all replaced with skip.
        Nrz   r"  s     r|   r   zPreTrainedModel._init_weightsh  s	     	r{   c                 d    t          |dd          rdS |                     |           d|_        dS )zM
        Initialize the weights if they are not already initialized.
        rs  FNT)r   r   rs  r"  s     r|   _initialize_weightsz#PreTrainedModel._initialize_weightsq  s@     6/77 	F6"""$(!!!r{   c                 ,   t          | j        dd          r>|                                 }|(|                     ||                                            t          | j        dd          rnt          | j        dd          rXt          | | j                  rt          | | j                  } |                     | j        | j	        | j        d          }|| _
        |                                 D ]&}t          |d          r|                                 'dS )	z
        Tie the weights between the input embeddings and the output embeddings.

        If the `torchscript` flag is set in the configuration, can't handle parameter sharing so we are cloning the
        weights instead.
        tie_word_embeddingsTNis_encoder_decoderFtie_encoder_decoderencoder_tie_weights)r   rK  r  _tie_or_clone_weightsr  r  r  _tie_encoder_decoder_weightsr  decoderr  r@  r  )r#  output_embeddingstied_weightsr   s       r|   tie_weightszPreTrainedModel.tie_weightsz  s!    4; 5t<< 	[ $ : : < < ,**+<d>W>W>Y>YZZZ4; 4e<< 		;VkmrAsAs 		;tT344 =tT%;<<<<dlD,BI L /;D+llnn 	& 	&Fv~.. &##%%%	& 	&r{   r  r  r  base_encoder_namec                    g }g |j         | j         k    r*t                              |j          d| j          d           	 	 	 ddt          j        dt          j        dt
          dt
          d	t          t
                   f
fd
 || |||           t          |          dk    rt                              d|            S )N and zZ are not equal. In this case make sure that all encoder weights are correctly initialized.r   r   decoder_pointerencoder_pointerro  r  uninitialized_encoder_weightsc                    t          | t          j                  rt          |t          j                  sJ |  d| d            t          | d          rt          |d          sJ | j        |_                            | | d           t          | d          r8t          |d          sJ                     | | d           | j        |_        d S |j        }| j        }	t          |	          dk    rct          |          dk    sJ d| d	|              fd
|	                                D             }
d}|	
                                D ]\  }}|                                rqt          t          |          |z             }|}t          |	|         t          ||                             s&t          |          t          |	          k    r|dz  }n||vr|dk    rt          d          |x}} |	|         ||         dz   |z   |||dz   | d| | d|            |
                    dz   |z              |t#          |
          z  }d S d S )Nr  z have to be of type nn.Moduler  r\  r  z.biasr   zEncoder module z does not match decoder module c                      h | ]
}d z   |z   S )/rz   )r   sub_namero  s     r|   rp  zkPreTrainedModel._tie_encoder_decoder_weights.<locals>.tie_encoder_to_decoder_recursively.<locals>.<setcomp>  s#    &k&k&k{S'88'C&k&k&kr{   r   i  zMax depth of recursive function `tie_encoder_to_decoder` reached. It seems that there is a circular dependency between two or more `nn.Modules` of your model.r  r   )depthtotal_encoder_nametotal_decoder_name)r  r   r   r  r  r  r  r  r   r   r   isdigitr   r   r  r   r  r   )r  r   ro  r  r  r  r  r  encoder_modulesdecoder_modulesall_encoder_weightsencoder_layer_posr   r   encoder_namedecoder_name"tie_encoder_to_decoder_recursivelyr  s     `             r|   r  zXPreTrainedModel._tie_encoder_decoder_weights.<locals>.tie_encoder_to_decoder_recursively  s3    ory99 Wj? ? W W!VVVVVW W W 11 99999)8)?&##'8$U:L$U$U$UVVV?F33 @"?F;;;;; ''+<(W>P(W(W(WXXX+:+?O(-6O-6O?##a''((1,,,f_ffUdff -,, 'l&k&k&kTcThThTjTj&k&k&k#$%!$3$9$9$;$; Q QLD&||~~ ;'*3t997H+H'I'I'+)/,*Go^jNkIlIlmm %ru+s s 11s2 s2 .2-$_44 (e  
 7;:|66'5'5#c)D0)5#ai.@+Q+Q<+Q+Q.@+Q+Q<+Q+Q	 	 	 	 (..{S/@</OPPPP-6I1J1JJ---Q ('r{   z;The following encoder weights were not tied to the decoder )r   r   r   )	r   r   r  r   r   r   r   r   r  )r  r  r  r  r  r  r  s        @@r|   r  z,PreTrainedModel._tie_encoder_decoder_weights  sE    46%"$ 111KK$ 6 67+< 6 6 6   !!A	K A	KYA	KYA	K A	K  #	A	K
 ,09A	K A	K A	K A	K A	K A	K A	KH 	+*W/1BDa	
 	
 	
 ,--11NNmNkmm   r{   c                    | j         j        r1t          j        |j                                                  |_        n|j        |_        t          |dd          [t          j                            |j	        j
        d|j        j        d         |j	        j        d         z
  fdd          |j	        _
        t          |d          rt          |d          r|j        |_        dS dS dS )zPTie or clone module weights depending of whether we are using TorchScript or notr  Nr   constantout_featuresnum_embeddings)rK  torchscriptr   r  r  cloner   
functionalpadr  r  r=  r  r  r  )r#  r  input_embeddingss      r|   r  z%PreTrainedModel._tie_or_clone_weights  s    ;" 	?')|4D4K4Q4Q4S4S'T'T$$'7'>$$fd33?*,-*;*;!&+%,2158I8N8TUV8WW + +"' $n55 	M'BRTd:e:e 	M-=-L***	M 	M 	M 	Mr{   c                    t                      }| g}t          |          dk    r|                    d          }|j        j        |vrvt          |t                    r=|j        t          |j        j         d| d          |t          |j                  z  }|t          |
                                          z  }t          |          dk    t          |          S )a  
        Get the modules of the model that should not be spit when using device_map. We iterate through the modules to
        get the underlying `_no_split_modules`.

        Args:
            device_map (`str`):
                The device map value. Options are ["auto", "balanced", "balanced_low_0", "sequential"]

        Returns:
            `List[str]`: List of modules that should not be split
        r   r   Nz does not support `device_map='z_'`. To implement support, the model class needs to implement the `_no_split_modules` attribute.)r:  r   r  r   r   r  r~  _no_split_modulesr   r   children)r#  r  r  modules_to_checkr   s        r|   _get_no_split_modulesz%PreTrainedModel._get_no_split_modules  s     EE 6"##a''%))"--F(0AAAfo66 ^/7(%/8 Z ZYc Z Z Z  
 ->FD\@]@],]) D):):$;$;;  "##a'' %&&&r{   new_num_tokenspad_to_multiple_ofmean_resizingc                    |                      |||          }|||S t          | d          o| j        du}t                      rR|sPddl}|j                            |j        d          5  |j        j        d         }ddd           n# 1 swxY w Y   n|j        j        d         }|| j	        
                                _        || _        |                                  |S )a$	  
        Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`.

        Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.

        Arguments:
            new_num_tokens (`int`, *optional*):
                The new number of tokens in the embedding matrix. Increasing the size will add newly initialized
                vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`, just
                returns a pointer to the input tokens `torch.nn.Embedding` module of the model without doing anything.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the embedding matrix to a multiple of the provided value.If `new_num_tokens` is set to
                `None` will just pad the embedding to a multiple of `pad_to_multiple_of`.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more
                details about this, or help on choosing the correct value for resizing, refer to this guide:
                https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
            mean_resizing (`bool`):
                Whether to initialize the added embeddings from a multivariate normal distribution that has old embeddings' mean and
                covariance or to initialize them with a normal distribution that has a mean of zero and std equals `config.initializer_range`.

                Setting `mean_resizing` to `True` is useful when increasing the size of the embeddings of causal language models,
                where the generated tokens' probabilities won't be affected by the added embeddings because initializing the new embeddings with the
                old embeddings' mean will reduce the kl-divergence between the next token probability before and after adding the new embeddings.
                Refer to this article for more information: https://nlp.stanford.edu/~johnhew/vocab-expansion.html

        Return:
            `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model.
        Nr  r   r  )_resize_token_embeddingsr  r  r!   r  r  r  r  r=  rK  get_text_config
vocab_sizer  )r#  r  r   r!  model_embedsrW  r  r%  s           r|   resize_token_embeddingsz'PreTrainedModel.resize_token_embeddings   s@   H 44^EWYfgg!&8&@ t^44V9JRV9V%'' 	6 	622<3FVZ2[[ : :)06q9
: : : : : : : : : : : : : : : &,215J 4>##%%0$ 	s   ,BBBc                    |                                  }|                     ||||          }t          |d          r|j        }t	          ||           |j        j        }|                    |           |                     |           t          | d          o| j	        d u}|rt                      rR|sPdd l}	|	j                            |j        d           5  |j        j        d         }d d d            n# 1 swxY w Y   n|j        j        d         }|                                 | j        j        s|                                 }
t%          |
t&          j        j                  r|                     |
||          }n|                     |
||          }t          |
d          r|
j        }t	          ||           |
j        j        }|                    |           |                     |           |                                  S )N_hf_hookr  r   r  )r!  )r  _get_resized_embeddingsr  r)  r_   r  rd  r  r  r  r!   r  r  r  r=  r  rK  r  r  rs   r   r]  _get_resized_lm_headset_output_embeddings)r#  r  r   r!  old_embeddingsnew_embeddingshookold_embeddings_requires_gradrW  r  old_lm_headnew_lm_headold_lm_head_requires_grads                r|   r#  z(PreTrainedModel._resize_token_embeddings[  ss   224455N,>
 
 >:.. 	5!*D~t444'5'<'J$%%&BCCC!!.111t^44V9JRV9V ))++ @L @    ^66~7L\`6aa D D%3%:%@%CND D D D D D D D D D D D D D D "0!6!<Q!? %%''3DK<[34466K+ux'9:: r"::;fs:tt"77^cp7qq{J// 6"+";555(3(:(H%&&'@AAA&&{333((***s   C99C= C=r-  c           	      &   |Kt          |t                    st          d| d          ||j        j        d         }||z   dz
  |z  |z  }nt
                              d| d           ||S t          | d          o| j        du}t                      r\|sZddl
}|j                            |j        d	          5  |j                                        \  }}ddd           n# 1 swxY w Y   n|j                                        \  }}||k    rt                      s|S t          |t          j                  s:t!          d
t#          |           dt          j         dt          j         d          t          j        |||j        j        |j        j                  }	||k    r|s|                     |	           n||k    r|rt
                              d           ||z
  }
t                      rZ|sXddl
}|j                            |j        gd	          5  |                     ||	|||
           ddd           n# 1 swxY w Y   n|                     ||	|||
           t/          ||          }t                      rt|srddl
}|j        |	j        g}|j                            |d	          5  |j        j        d|ddf         |	j        j        d|ddf<   ddd           n# 1 swxY w Y   n+|j        j        d|ddf         |	j        j        d|ddf<   t                      r|sddl
}|j        |	j        g}|j                            |d	          5  |	j        |_        |	j        j        j        d         |_        |j        |dz
  |j        k     rd|_        ddd           n# 1 swxY w Y   nN|	j        j        |j        _        |	j        j        j        d         |_        |j        |dz
  |j        k     rd|_        |S )a	  
        Build a resized Embedding Module from a provided token Embedding Module. Increasing the size will add newly
        initialized vectors at the end. Reducing the size will remove vectors from the end

        Args:
            old_embeddings (`torch.nn.Embedding`):
                Old embeddings to be resized.
            new_num_tokens (`int`, *optional*):
                New number of tokens in the embedding matrix.

                Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
                vectors from the end. If not provided or `None`, just returns a pointer to the input tokens
                `torch.nn.Embedding` module of the model without doing anything.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the embedding matrix to a multiple of the provided value. If `new_num_tokens` is set to
                `None` will just pad the embedding to a multiple of `pad_to_multiple_of`.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more
                details about this, or help on choosing the correct value for resizing, refer to this guide:
                https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
            mean_resizing (`bool`):
                Whether to initialize the added embeddings from a multivariate normal distribution that has old embeddings' mean and
                covariance or to initialize them with a normal distribution that has a mean of zero and std equals `config.initializer_range`.

                Setting `mean_resizing` to `True` is useful when increasing the size of the embeddings of causal language models,
                where the generated tokens' probabilities will not be affected by the added embeddings because initializing the new embeddings with the
                old embeddings' mean will reduce the kl-divergence between the next token probability before and after adding the new embeddings.
                Refer to this article for more information: https://nlp.stanford.edu/~johnhew/vocab-expansion.html


        Return:
            `torch.nn.Embedding`: Pointer to the resized Embedding Module or the old Embedding Module if
            `new_num_tokens` is `None`
        Nz5Asking to pad the embedding matrix to a multiple of `z@`, which is not and integer. Please make sure to pass an integerr   r   zYou are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be a.  . This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tcr  r  zOld embeddings are of type , which is not an instance of zj. You should either use a different resize function or make sure that `old_embeddings` are an instance of r   r5  zThe new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`)r  r   r   r  r=  r   r  r  r  r!   r  r  r  sizer   r]  	TypeErrorr  r   r   r   r  (_init_added_embeddings_weights_with_meanr0  r  r  padding_idx)r#  r-  r  r   r!  rW  r  old_num_tokensold_embedding_dimr.  added_num_tokensnparamss                r|   r*  z'PreTrainedModel._get_resized_embeddings  s(   V )0#66   aL^  a  a  a   %!/!6!<Q!?-0BBQFK]]assNNKKD&4D D D   !!!t^44V9JRV9V%'' 	M 	M22>3HX\2]] Q Q4B4I4N4N4P4P1 1Q Q Q Q Q Q Q Q Q Q Q Q Q Q Q 1?0E0J0J0L0L-N-^++4N4P4P+!!.",77 	$d>.B.B $ $bdbn $ $L$ $ $   !(/ '-	
 
 
 N**=*~....n,,, =    .>)++ 
L 
    ^668M7N^b6cc  AA&8I>[k                
 =="N4E~Wg   //%'' 	R 	R$+^-BCF2262KK V V4B4I4NrPQrSTSTSTu4U%*2A2qqq51V V V V V V V V V V V V V V V 1?0E0J2A2qqq50QN!&rr111u-
 &'' 	2 	2$+^-BCF2262KK 6 6(6(=%0>0E0J0PQR0S- "-9~PQ?QUcUo>o>o15N.6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 *8)>)CN!&,:,A,F,LQ,ON))5>A;MQ_Qk:k:k-1*sI   >C''C+.C+ II
I
9,K11K58K5&AN77N;>N;r1  
transposedc           	         ||S t          | d          o| j        du}t                      r|sddl}|j                            |j        d          5  |s|j                                        n*|j                                                                        \  }}ddd           n# 1 swxY w Y   nI|s|j                                        n*|j                                                                        \  }}||k    rt                      s|S t          |t          j                  s:t          dt          |           dt          j         dt          j         d          |s||fn||f}	|j        du}
t          j        |	|
|j        j        |j        j        d	}||k    r|s|                     |           n||k    r|rt$                              d
           ||z
  }t                      r|sddl}|j        g}|
r||j        gz  }|j                            |d          5  |                     ||||||           |
r|                     |||           ddd           n# 1 swxY w Y   n3|                     ||||||           |
r|                     |||           t-          ||          }t                      rn|slddl}|j        |j        |j        |j        g}|j                            |d          5  |                     |||||
           ddd           n# 1 swxY w Y   n|                     |||||
           |S )a  
        Build a resized Linear Module from a provided old Linear Module. Increasing the size will add newly initialized
        vectors at the end. Reducing the size will remove vectors from the end

        Args:
            old_lm_head (`torch.nn.Linear`):
                Old lm head liner layer to be resized.
            new_num_tokens (`int`, *optional*):
                New number of tokens in the linear matrix.

                Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
                vectors from the end. If not provided or `None`, just returns a pointer to the input tokens
                `torch.nn.Linear` module of the model without doing anything. transposed (`bool`, *optional*, defaults
                to `False`): Whether `old_lm_head` is transposed or not. If True `old_lm_head.size()` is `lm_head_dim,
                vocab_size` else `vocab_size, lm_head_dim`.
            mean_resizing (`bool`):
                Whether to initialize the added embeddings from a multivariate normal distribution that has old embeddings' mean and
                covariance or to initialize them with a normal distribution that has a mean of zero and std equals `config.initializer_range`.

                Setting `mean_resizing` to `True` is useful when increasing the size of the embeddings of causal language models,
                where the generated tokens' probabilities will not be affected by the added embeddings because initializing the new embeddings with the
                old embeddings' mean will reduce the kl-divergence between the next token probability before and after adding the new embeddings.
                Refer to this article for more information: https://nlp.stanford.edu/~johnhew/vocab-expansion.html

        Return:
            `torch.nn.Linear`: Pointer to the resized Linear Module or the old Linear Module if `new_num_tokens` is
            `None`
        Nr  r   r  z#Old language model head is of type r5  zg. You should either use a different resize function or make sure that `old_lm_head` are an instance of r   )r  r   r   a  The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`)r  r  r!   r  r  r  r  r6  r   r  r   Linearr7  r  r  r   r   r   r   r  %_init_added_lm_head_weights_with_mean"_init_added_lm_head_bias_with_meanr0  !_copy_lm_head_original_to_resized)r#  r1  r  r?  r!  rW  r  r:  old_lm_head_dimnew_lm_head_shapehas_new_lm_head_biasr2  r<  r>  num_tokens_to_copys                  r|   r+  z$PreTrainedModel._get_resized_lm_head 	  s   F !t^44V9JRV9V%'' 
	 
	22;3EUY2ZZ  5?bK&++---[EWEYEYE[E[E`E`EbEb 0               2<^"'')))ASAUAUAWAWA\A\A^A^ ,NO ^++4N4P4P++ry11 	!d;6G6G ! !gigp ! !I! ! !   FPv_n==VdfuUv*/t; i%%,$*	
 
 
 N**=*{++++n,,, =    .>)++ hL h    %,-' 1{/00F^66vT6RR l l>>#[/>Sceo   , l??[Zjkkkl l l l l l l l l l l l l l l ::o~O_ak   ( h;;KVfggg @@%'' 	 	!(+*:K<NP[P`aF2262KK  66.@*Nb                
 22[*<jJ^   s7   A
B))B-0B-,4I,,I03I0L++L/2L/c                 (   |j         j                            t          j                  }t          j        |d          }||z
  }|j        |z  |z  }	t          j                            |	          }
t          |	|	j        k    
                                o,t          j        |
           o|
dk    
                                          }|rut          j        j                            |d|	z            }|                    |f                              |j         j                  |j         j        d|z  d d d f<   d S |d d d f                             |d                              |j         j                  |j         j        d|z  d d d f<   d S )Nr   r6  &.>)covariance_matrix)sample_shaper   r   )r  r  r  rs   r   meanTlinalgeigvalsr   all
is_complexdistributionsmultivariate_normalMultivariateNormalsampler   r<  )r#  r-  r.  r;  r:  r<  old_embeddings_weightmean_embeddingsold_centered_embeddings
covarianceeigenvaluesis_covariance_psddistributions                r|   r8  z8PreTrainedModel._init_added_embeddings_weights_with_mean	  s    !/ 5 : = =em L L*%:CCC"7/"I,.1HH>Y
 l**:66 :<',,..pu7G7T7T3TpZehiZiYnYnYpYp
 
  	 .BUU4*3D V  L FREXEX.0 FY F Fb&,-- !&r,<'<'>'>'ABBB  aaa(//0@!DDGGH]Hcdd !&r,<'<'>'>'ABBBr{   c                    |r6|j         j        j        |j         _        |j         j        j        |j         _        |                     |||||           |r8|j         j        j        |j         _        |j         j        j        |j         _        d S d S r   )r  r  rN  r8  )r#  r1  r2  rE  r:  r<  r?  s          r|   rB  z5PreTrainedModel._init_added_lm_head_weights_with_mean	  s      	@&1&8&=&?K#&1&8&=&?K# 	55o~GW	
 	
 	
  	@&1&8&=&?K#&1&8&=&?K###	@ 	@r{   c                 4   t          j        |j        j        dt           j                  }t          j        |j        j        d                              t           j                  }|j        j        d|z  d                              |d|z             d S )Nr   )r7  r   r6  r   rJ  )rM  std)rs   rM  r  r  r   r`  r  r   )r#  r1  r2  r<  	bias_meanbias_stds         r|   rC  z2PreTrainedModel._init_added_lm_head_bias_with_mean	  s    J{/41EMRRR	9[-2;;;>>u}MMb#33556>>ISWZbSb>cccccr{   c                    |s,|j         j        d |d d f         |j         j        d |d d f<   n+|j         j        d d d |f         |j         j        d d d |f<   |r%|j        j        d |         |j        j        d |<   d S d S r   )r  r  r  )r#  r2  r1  rH  r?  rG  s         r|   rD  z1PreTrainedModel._copy_lm_head_original_to_resized	  s      	n>I>P>UViWiViklklklVl>mK#$7%7$7$:;;>I>P>UVWVWVWYlZlYlVl>mK#AAA':(:':$:;   	d9D9I9NObPbOb9cK!"5#5"5666	d 	dr{   new_num_position_embeddingsc           	      \    t          d| j         d| j         d| j        j         d          )Nz4`resize_position_embeddings` is not implemented for B`. To implement it, you should overwrite this method in the class  in `modeling_.py`r  r   rx  )r#  rd  s     r|   resize_position_embeddingsz*PreTrainedModel.resize_position_embeddings	  sV    !p4> p p26.p pPTP^Pip p p
 
 	
r{   c           	      \    t          d| j         d| j         d| j        j         d          )Nz1`get_position_embeddings` is not implemented for rf  rg  rh  ri  r'  s    r|   get_position_embeddingsz'PreTrainedModel.get_position_embeddings	  sV    !p p p26.p pPTP^Pip p p
 
 	
r{   c                     | j         j        r|                     | j         j                   t          r0|                     | j                   |                                  dS dS )z
        If needed prunes and maybe initializes weights. If using a custom `PreTrainedModel`, you need to implement any
        initialization logic in `_init_weights`.
        N)rK  pruned_headsprune_headsr   applyr  r  r'  s    r|   r  zPreTrainedModel.init_weights	  sk     ;# 	7T[5666 	JJt/000 	 	r{   heads_to_prunec                     |                                 D ]^\  }}t          | j        j                            |g                     t          |          z  }t          |          | j        j        |<   _| j                            |           dS )a  
        Prunes heads of the base model.

        Arguments:
            heads_to_prune (`Dict[int, List[int]]`):
                Dictionary with keys being selected layer indices (`int`) and associated values being the list of heads
                to prune in said layer (list of `int`). For instance {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on
                layer 1 and heads 2 and 3 on layer 2.
        N)r   r:  rK  rn  ry   r   r  _prune_heads)r#  rq  layerheadsunion_headss        r|   ro  zPreTrainedModel.prune_heads
  s     +0022 	@ 	@LE5dk6::5"EEFFUSK.2;.?.?DK$U++$$^44444r{   c                    | j         st          | j        j         d          |ddi}t	          j        t          fi |}dt          j        | j	                  j
        v }|s| 	                    d|           nC|                     t          | j	        d                     t                              d           t          | d	d
          r|                                  dS dS )az  
        Activates gradient checkpointing for the current model.

        Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
        activations".

        We pass the `__call__` method of the modules instead of `forward` because `__call__` attaches all the hooks of
        the module. https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2

        Args:
            gradient_checkpointing_kwargs (dict, *optional*):
                Additional keyword arguments passed along to the `torch.utils.checkpoint.checkpoint` function.
        z) does not support gradient checkpointing.Nuse_reentrantTr  )enablegradient_checkpointing_funcr  V  You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model._hf_peft_config_loadedF)r  r   r   r   	functoolsr   r   r  r  _set_gradient_checkpointingr   rp  r   r  r   r  )r#  gradient_checkpointing_kwargsrz  _is_using_old_formats       r|   r  z-PreTrainedModel.gradient_checkpointing_enable
  s    3 	d 7bbbccc(0-<d,C)&/&7
&d&dFc&d&d#  ''*;D<\*]*]*hh# 	,,DVq,rrrrJJwt?tLLLMMMNNH  
 41599 	.
 ++-----	. 	.r{   ry  rz  c                     d}t          | d          r|| _        || _        d}|                                 D ]"}t          |d          r||_        ||_        d}#|st	          | j        j         d          d S )NFr  Tz is not compatible with gradient checkpointing. Make sure all the architecture support it by setting a boolean attribute `gradient_checkpointing` to modules of the model that uses checkpointing.)r  _gradient_checkpointing_funcr  r@  r   r   r   )r#  ry  rz  is_gradient_checkpointing_setr   s        r|   r  z+PreTrainedModel._set_gradient_checkpointing=
  s    (-% 4122 	10KD-*0D',0)llnn 	5 	5Fv788 56Q306-04-, 	>* ] ] ]  	 	r{   c                 Z   | j         r|dt          j        | j                  j        v }|s|                     d           nCt
                              d           |                     t          | j        d                     t          | dd          r| 
                                 dS dS )z
        Deactivates gradient checkpointing for the current model.

        Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
        activations".
        r  F)ry  r|  r{  r}  N)r  r  r  r  r   r   r  rp  r   r   r  )r#  r  s     r|   gradient_checkpointing_disablez.PreTrainedModel.gradient_checkpointing_disableS
  s     / 	S $+g.?@`.a.a.l#l ' S000>>>>L   

74#C5QQQRRR41599 	/,,.....	/ 	/r{   c                 X    t          d |                                 D                       S )z
        Whether gradient checkpointing is activated for this model or not.

        Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
        activations".
        c              3   D   K   | ]}t          |d           o|j        V  dS )r  N)r  r  )r   ms     r|   r  z<PreTrainedModel.is_gradient_checkpointing.<locals>.<genexpr>r
  s6      mmYZ71677TA<Tmmmmmmr{   )r  r@  r'  s    r|   is_gradient_checkpointingz)PreTrainedModel.is_gradient_checkpointingj
  s.     mm^b^j^j^l^lmmmmmmr{   5GBsave_directoryis_main_processr   save_functionpush_to_hubr   safe_serializationr	  tokensave_peft_formatc           	        HI |                     dd          }|                     dd          }|-t          j        dt                     |	t	          d          |}	|	|	|d<   t          | dd          }t          | d	d          }|duo*t          |t                    o|                    |
          }|!|s|st	          d|j	        j
         d          d|v r)t          j        d           |                     d          }|rt                      st          d          t          j                            |          r t                               d| d           dS t          j        |d           |rw|                     dd          }|                     d|                    t          j        j                  d                   } | j        |fi |}|                     |          }t/          |           }t1          |          }t3          |                              d          d         |j        _        |j        j        g|j        _        d|j        _        | j         tC          | || j                   |r|s|j        "                                }| #                                rwtI          |          dk    rdt          j        d| dtJ                     |&                                D ]1\  }}tO          |j(        ||           tO          |j        |d           2|j        )                    |           | #                                r|j(        )                    |           |rt           *                    d           |+                                |
r@t           *                    d           i }&                                D ]\  }}||d| <   || ,                                }tI          |          dk    rt	          d           |d         }| j-        |         }|)                    |           i }t]          | d!          rtI          t_          | j0        1                                                    dk    rd"| j0        1                                v sd#| j0        1                                v rYt          j        d$           |2                                D ]0\  H}Hd%k    r|3                                } | D ]}||Hd| z   <   1|3                                th          r'tj          j6        j7        j8        D ]\  }!}" |!          | j9        #| j9        D ]}#|#:                                v r|#= |rtw          j<        tz                    }$&                                D ]p\  H}%t          |%t|          j?                  r)|$t          |%                   A                    H           H|$t          |%                   A                    H           qt]          | d!          r=t          |           }&|&r)|&d         IIfd&|$&                                D             }'n!i }'nd' |$&                                D             }'t          |           }(g })t_                      }*|'1                                D ]f}+|(bd},t          |+          D ]PHt          Hfd(|(D                       }-|-r1Hv r-|,dz  },|,tI          |+          k     r|*G                    H           Qgt          |'1                                          \  }.}/|/D ]HH         I                                H<    
t          |.          \  }.}0|0D ]\}1|1K                    |*          }2|2D ]HH= |1L                    |*          }3tI          |3          dk    r|)A                    |3           ]|.r"|)A                    t_          |.                     tI          |)          dk    rt          d)|) d*          |s!|rt          nt          }4t          |4|          }4n|rt          nt          }4|4S                    d+d,          S                    d-d.          }5t          |5|/          }6d}7|6jU        r|6jV        |6jW        d0}7t          jX        |          D ]}8t          j        Y                    ||8          }9|4S                    d+d%          S                    d-d%          }:|8S                    d+d%          S                    d-d%          };t          j[        d1          }<|8\                    |:          ret          j                            |9          rF|8|6j]        :                                vr+|r)|<^                    |;          t          j_        |9           |6j]        &                                }=|rt          ja        |=d23          }=|=D ]\  }>}?fd4|?D             }@|rut          t          jd        d5          k     rt          d6t           d7          d8 |@D             }A|@D ]}B||B         }t          ||B|A          }A|A}@~At          jg                     |r3t          |@t          j        Y                    ||>          d9d:i;            ||@t          j        Y                    ||>                     |7>t          j        Y                    ||4          }Ct 
          *                    d<|C            n|rt          nt          }Dt          j        Y                    |t          |D|                    }Dt          |Dd=d>?          5 }Et          jm        |7d@dA          dBz   }F|En                    |F           ddd           n# 1 swxY w Y   t           *                    dC| dDtI          |6j]                   dE|D d           |rgt          || jp        |	|F          }G|Gq                    t          j        Y                    |dG                     | r                    |||||	H           dS dS )Ia  
        Save a model and its configuration file to a directory, so that it can be re-loaded using the
        [`~PreTrainedModel.from_pretrained`] class method.

        Arguments:
            save_directory (`str` or `os.PathLike`):
                Directory to which to save. Will be created if it doesn't exist.
            is_main_process (`bool`, *optional*, defaults to `True`):
                Whether the process calling this is the main process or not. Useful when in distributed training like
                TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on
                the main process to avoid race conditions.
            state_dict (nested dictionary of `torch.Tensor`):
                The state dictionary of the model to save. Will default to `self.state_dict()`, but can be used to only
                save parts of the model or if special precautions need to be taken when recovering the state dictionary
                of a model (like when using model parallelism).
            save_function (`Callable`):
                The function to use to save the state dictionary. Useful on distributed training like TPUs when one
                need to replace `torch.save` by another method.
            push_to_hub (`bool`, *optional*, defaults to `False`):
                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
                namespace).
            max_shard_size (`int` or `str`, *optional*, defaults to `"5GB"`):
                The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size
                lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5MB"`).
                We default it to 5GB in order for models to be able to run easily on free-tier google colab instances
                without CPU OOM issues.

                <Tip warning={true}>

                If a single weight of the model is bigger than `max_shard_size`, it will be in its own checkpoint shard
                which will be bigger than `max_shard_size`.

                </Tip>

            safe_serialization (`bool`, *optional*, defaults to `True`):
                Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
            variant (`str`, *optional*):
                If specified, weights are saved in the format pytorch_model.<variant>.bin.
            token (`str` or `bool`, *optional*):
                The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
                the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
            save_peft_format (`bool`, *optional*, defaults to `True`):
                For backward compatibility with PEFT library, in case adapter weights are attached to the model, all
                keys of the state dict of adapters needs to be pre-pended with `base_model.model`. Advanced users can
                disable this behaviours by setting `save_peft_format` to `False`.
            kwargs (`Dict[str, Any]`, *optional*):
                Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
        use_auth_tokenNignore_metadata_errorsFrThe `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.V`token` and `use_auth_token` are both specified. Please set only the argument `token`.r  r}  r  )r  zThe model is quantized with z and is not serializable - check out the warnings from the logger on the traceback to understand the reason why the quantized model is not serializable.save_configze`save_config` is deprecated and will be removed in v5 of Transformers. Use `is_main_process` instead.zR`safe_serialization` requires the `safetensors library: `pip install safetensors`.zProvided path (z#) should be a directory, not a fileTexist_okcommit_messagerepo_idr   r   r   )rK  r   zHMoving the following attributes in the config to the generation config: z. You are seeing this warning because you've set generation parameters in the model config, as opposed to in the generation config.zhDetected adapters on the model, saving the model in the PEFT format, only adapter weights will be saved.zTo match the expected format of the PEFT library, all keys of the state dict of adapters will be pre-pended with `base_model.model`.zbase_model.model.zMultiple active adapters detected, saving multiple active adapters is not supported yet. You can save adapters separately one by one by iteratively calling `model.set_adapter(adapter_name)` then `model.save_pretrained(...)`hf_device_mapr2  r  z|Attempting to save a model with offloaded modules. Ensure that unallocated cpu memory exceeds the `shard_size` (5GB default)r   c                 R    i | ]#\  }}t          fd |D                        ||$S )c              3       K   | ]}|v V  	d S r   rz   )r   r   
tied_namess     r|   r  z=PreTrainedModel.save_pretrained.<locals>.<dictcomp>.<genexpr>^  s*      HnHn`dQ[I[HnHnHnHnHnHnr{   )r  )r   ptrr  r  s      r|   
<dictcomp>z3PreTrainedModel.save_pretrained.<locals>.<dictcomp>]  sR     # # #'1sESHnHnHnHnhmHnHnHnEnEn#U# # #r{   c                 @    i | ]\  }}t          |          d k    ||S r   r   )r   r  r  s      r|   r  z3PreTrainedModel.save_pretrained.<locals>.<dictcomp>c  s+    [[[jc5CPUJJYZNNsENNNr{   c              3   B   K   | ]}t          j        |          V  d S r   r   r   )r   patr   s     r|   r  z2PreTrainedModel.save_pretrained.<locals>.<genexpr>o  s/      -a-asbiT.B.B-a-a-a-a-a-ar{   z8The weights trying to be saved contained shared tensors z that are mismatching the transformers base configuration. Try saving using `safe_serialization=False` or remove this tensor sharing.r  z{suffix}.binr  z{suffix}.safetensors)filename_patternr   r	  z(.*?)-\d{5}-of-\d{5}zSaving checkpoint shardsdescc                 F    i | ]}||                                          S rz   )r  )r   rw  r   s     r|   r  z3PreTrainedModel.save_pretrained.<locals>.<dictcomp>  s,    SSSVZ/::<<SSSr{   rj   zxYou need accelerate version to be greater or equal than 0.31 to save models with offloaded parameters. Detected version z<. Please upgrade accelerate with `pip install -U accelerate`c                     i | ]}|d S r   rz   )r   r   s     r|   r  z3PreTrainedModel.save_pretrained.<locals>.<dictcomp>  s    #?#?#?D"#?#?#?r{   r\  rY  )r
  zModel weights saved in wr"  r#  r  )indent	sort_keys
z:The model is bigger than the maximum size per checkpoint (z) and is going to be split in z^ checkpoint shards. You can find where each parameters has been saved in the index located at )r  r  z	README.md)r  r  )sr  r8  r9  r:  r   r   r  r,   is_serializablequantization_configquant_methodrJ   r  rw   r4  r6  r   errormakedirsr  sep_create_repo_get_files_timestampsunwrap_modelr   r   rK  r  r   r   architecturesr  _auto_classr   &_get_non_default_generation_parametersr  r   UserWarningr   r   r  save_pretrainedr  get_adapter_state_dictactive_adapterspeft_configr  r:  r  r   rq  r   IS_SAGEMAKER_MP_POST_1_10smpstatemodule_managertranslate_functions_keys_to_ignore_on_saver   r  r  r   rs   r   r&   r  idrb   r  sortedr  r  r  r  r  intersection
differencer;  r6   r:   r  r0   r1   r  r   
is_shardedr
  tensor_to_filenamelistdirr5  r   compiler   filename_to_tensors	fullmatchr  rM   tqdmaccelerate_versionr   rf  rk   r>  r?  safe_save_filer5   r9   r7  r8  dumpswriterQ   r  save_upload_modified_files)Jr#  r  r  r   r  r  r   r  r	  r  r  r   r  r  r}  r  quantization_serializabler  r  files_timestampsmodel_to_saver   misplaced_generation_parametersr  param_valuepeft_state_dictr   r  active_adaptercurrent_peft_config
module_mapr   module_state_dict	smp_to_hfr  
ignore_keyptrsrw  tied_paramsshared_ptrsr  error_namesto_delete_namesr  foundmatches_patternshared_namesdisjoint_namesidentical_namesinamesknownunknownr  r  state_dict_splitr  filenamefull_filenameweights_no_suffixfilename_no_suffixregr  r  r  r  shard_state_dictro  path_to_weightssave_index_filerL  content
model_cardr   r  sJ      `                                                                    @@r|   r  zPreTrainedModel.save_pretrainedt
  s   ~  $4d;;!',De!L!L%M E     l   #E#F7O!(/G!O!Ot^T::$ T<55T,,@R,SS 	" #,B#Kd#u|/O/\ u u u  
 F""Mw   %jj77O 	t&>&@&@ 	trsss7>>.)) 	LL^>^^^___F
NT2222 	J#ZZ(8$??NjjN,@,@,M,Mb,QRRG'd'::6::G#99.II %T** $M22+.u::+;+;C+@+@+C( /<.E.N-O* =B9 't^DKHHHH  ,	D) E2?2F2m2m2o2o/$$&& 	H3/N+O+ORS+S+SMm:m m m $	   4S3X3X3Z3Z H H/
K ?[YYY 4j$GGGG$44^DDD  "" P/??OOO% D~   +AACC
# 1KK _   ')O&0&6&6&8&8 K K
UEJ(AC(A(ABB!0J!%!5!5!7!7~&&**$u   "0!2&*&6~&F##33NCCC 
  o..>D.55778899A==d0779999VtGYG`G`GbGb=b=b S   %2$?$?$A$A > >LD&rzz (.(9(9(;(;%0 > >7=
4)c))#344>&1133J % 	3 #	 8 L 3 3	1&Yz22

 '3": / /
!2!222":. F	 *400D * 0 0 2 2 2 2f fel33 2*6223::4@@@@ F$++D1111 t_-- \2488 %!,QJ# # # #59ZZ\\# # #KK #%KK[[DJJLL[[[ "7t!<!<K!eeO$++-- 
: 
: &1E &u : :*--a-a-a-aN`-a-a-a*a*a* :tz/A/A!QJE$s5zz11 / 3 3D 9 9 9+9+:L:L:N:NPZ+[+[(L. ' < <#-d#3#9#9#;#;
4   -<L*,U,U)L/) 0 0++O<<! ) )D"4(( ++O<<w<<!##&&w/// 6""3|#4#4555;!##" b{  b  b  b  
 & 	e0BT,,L'g>>LL8Jd44PdL'//GGOOP^`vww=)9.
 
 
 & 	,5.A E 
>22 	) 	)HGLLBBM !- 4 4VR @ @ H HY[ \ \ "*!1!1&"!=!=!E!EnVX!Y!Y*455C ##$566)GNN=11) $4$H$M$M$O$OOO# PMM"455A	-(((.BHHJJ 	e"),/BIc"d"d"d#6 	O 	OJSSSS7SSSE %f(=(===%V  Tf V V V  
 $@#?#?#?#? #( j jK'4F'B6;Xh'i'i$$ )$
! O ubgll>:&N&NZbdhYijjjjjeRW\\.*%M%MNNNN= gll><HHOKKC/CCDDDD9Kc55QcO gll><Y`;a;abbOosW=== !*U1EEEL   ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! KK7^ 7 7 0 DEE7 7$37 7 7    	2Nd  J
 OOBGLLEEFFF'' - (     	 	s   0oo	oc                     | j         | j         ng }|                    dg           }t          |t                    r|g}|D ]}||vr|                    |           |r||d<    t                      j        |i |S )Nr  )r  ry   r  r   r  r  r  )r#  r   r   r  tags_kwargsr  r   s         r|   r  zPreTrainedModel.push_to_hub  s    "&/"=t2jj,,k3'' 	(&-K 	! 	!C$C    	"!F6N"uww"D3F333r{   c                     t          d |                                 D                       }|r0t          d |                                 D                       }||z   }|S )a  
        Get the memory footprint of a model. This will return the memory footprint of the current model in bytes.
        Useful to benchmark the memory footprint of the current model and design some tests. Solution inspired from the
        PyTorch discussions: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822/2

        Arguments:
            return_buffers (`bool`, *optional*, defaults to `True`):
                Whether to return the size of the buffer tensors in the computation of the memory footprint. Buffers
                are tensors that do not require gradients and not registered as parameters. E.g. mean and std in batch
                norm layers. Please see: https://discuss.pytorch.org/t/what-pytorch-means-by-buffers/120266/2
        c                 `    g | ]+}|                                 |                                z  ,S rz   ry  r|  )r   r  s     r|   r   z8PreTrainedModel.get_memory_footprint.<locals>.<listcomp>  s3    ZZZu5>>##e&8&8&:&::ZZZr{   c                 `    g | ]+}|                                 |                                z  ,S rz   r  )r   bufs     r|   r   z8PreTrainedModel.get_memory_footprint.<locals>.<listcomp>  s1    ZZZCCLLNNS-=-=-?-??ZZZr{   )rg  r   r   )r#  return_buffersr  mem_bufss       r|   get_memory_footprintz$PreTrainedModel.get_memory_footprint	  sd     ZZHYHYZZZ[[ 	!ZZ4<<>>ZZZ[[H.C
r{   c                    t          | dd           t          j        k    rt          d          t          | dd           t          j        k    rt          | dd          rt          d          t          j        t          j                            d                    t          j        d          k     rt          d| j	         d	          d S  t                      j        |i |S )
Nquantization_methodz2`.cuda` is not supported for HQQ-quantized models.is_loaded_in_8bitFzCalling `cuda()` is not supported for `8-bit` quantized models.  Please use the model as it is, since the model has already been set to the correct devices.rc  0.43.2zCalling `cuda()` is not supported for `4-bit` quantized models with the installed version of bitsandbytes. The current device is `L`. If you intended to move the model, please install bitsandbytes >= 0.43.2.)r   rX   HQQr   BITS_AND_BYTESr   rf  r  r
  r   r  r  )r#  r   r   r   s      r|   r  zPreTrainedModel.cuda  s    4.559K9OOOQRRR4.559K9ZZZt0%88 	 s   y199.IIJJW][cMdMddd H.2kH H H   ed  577<0000r{   c                    d|v }|s#|D ] }t          |t          j                  rd} n!t          | dd           t          j        k    rt          d          t          | dd           t          j        k    r|rt          d          t          | dd          rt          d          t          j	        t          j                            d	                    t          j	        d
          k     rt          d| j         d          n0t          | dd           t          j        k    r|rt          d           t                      j        |i |S )Nr   Tr
  z0`.to` is not supported for HQQ-quantized models.zYou cannot cast a bitsandbytes model in a new `dtype`. Make sure to load the model using `from_pretrained` using the desired `dtype` by passing the correct `torch_dtype` argument.r  Fz`.to` is not supported for `8-bit` bitsandbytes models. Please use the model as it is, since the model has already been set to the correct devices and casted to the correct `dtype`.rc  r  zCalling `to()` is not supported for `4-bit` quantized models with the installed version of bitsandbytes. The current device is `r  zYou cannot cast a GPTQ model in a new `dtype`. Make sure to load the model using `from_pretrained` using the desired `dtype` by passing the correct `torch_dtype` argument.)r  rs   r   r   rX   r  r   r  r   rf  r  r
  r   GPTQr  r  )r#  r   r   dtype_present_in_argsargr   s        r|   r  zPreTrainedModel.to.  s    !(6 1$ 	  c5;// ,0)E 4.559K9OOOOPPP4.559K9ZZZ$  V  
 t0%88 	 l   y199.IIJJW][cMdMddd H.2kH H H   e
 T0$77;M;RRR$  N   uwwz4*6***r{   c                 p    t          | dd          rt          d           t                      j        | S )NrW  Fz`.half()` is not supported for quantized model. Please use the model as it is, since the model has already been casted to the correct `dtype`.)r   r   r  halfr#  r   r   s     r|   r  zPreTrainedModel.halfV  sD    4// 	'I  
  577<&&r{   c                 p    t          | dd          rt          d           t                      j        | S )NrW  Fz`.float()` is not supported for quantized model. Please use the model as it is, since the model has already been casted to the correct `dtype`.)r   r   r  r   r  s     r|   r   zPreTrainedModel.float`  sD    4// 	(I  
 !577=$''r{   main)	rK  	cache_dirignore_mismatched_sizesforce_downloadlocal_files_onlyr  revisionuse_safetensorsr0  r  r  r  r  r  r  r  r0  c       	         5  X |                     dd          }|                     dd          }|                     dd          }|                     dd          }|                     dd          }|                     dd          }|                     d	d          }|                     d
d          }|                     dd          }|                     dd          }|                     dd          }|                     dd          }|                     dd          }|                     dd          }|                     dd          }|                     dd          }|                     dd          }|                     dd          }|                     dd          }|                     dd          } |                     dd          }!|                     dd          }"|                     dd          }#|                     dd          }$|                     dd          }%|                     di           }&|                     dd           }'|                     d!d          }(|                     d"d          })|                     d#d          }*d}+t                      rd}|-t          j        d$t                     |t          d%          |}||&	d&|&vr||&d&<   |	t                      sd}	|du rt                              d'           |*t                      st          d(          |$Xt          |t                    s2t          |t          ||||||||#ddd)          },t          |,|$          }$nt          |dd          }$t!                      r|&                     d*d          }-|-t#          |f||||||$d+|&}-|-et$          j                            |-          rFt+          |-d,d-.          5 }.|}-t-          j        |.          d/         }ddd           n# 1 swxY w Y   nd}-t          |t0          j                  rd|i}nt          |t4                    r=|d0vr9	 dt1          j        |          i}nO# t6          $ r t          d1| d2          w xY wt          |t8                    r|d3k     rt
          d4          d|i}||d}n|st          d5          |rCt;                      rt          d6          t                      st=          d7t>           d8          |!s| ri|"t          d9          d: |                                 D             }/i |/|!| d;}/tC          j"        d|/dd<|\  }"}t                              d=           ||z   }0d>d?|d@}1|||1dA<   tG                      r|st          $                    dB           d}t          |t                    s)||n|}2 | j%        j&        |2f|d|||||||#||dC|\  }}3n5tO          j(        |          }|                     dDd          }4|4|4|_)        |}3t          |dd          du}5|5s|"E|5r tU          j+        |j,        |"          |_,        n|"|_,        tU          j-        |j,        |5E          }6nd}6|6u|6.                    ||||F           |6/                    |          }|60                    |          }|6j,        j1        j2        |1dG<   |d}t                              dH           |6du}7d}8d}9d}:dXd};|*|6t          dI          |	|*	t5          |          }t$          j        3                    |          }<|<r |rqt$          j                            t$          j        4                    ||#tj          dJz                       r+t$          j        4                    ||#tj          dJz             }=n|rkt$          j                            t$          j        4                    ||#tl                              r(t$          j        4                    ||#tl                    }=n|rkt$          j                            t$          j        4                    ||#tn                              r(t$          j        4                    ||#tn                    }=n|	
durt$          j                            t$          j        4                    ||#tq          tr          |%                              r6t$          j        4                    ||#tq          tr          |%                    }=n|	durt$          j                            t$          j        4                    ||#tq          tt          |%                              r8t$          j        4                    ||#tq          tt          |%                    }=d}8n|	st$          j                            t$          j        4                    ||#tq          tv          |%                              r6t$          j        4                    ||#tq          tv          |%                    }=ny|	st$          j                            t$          j        4                    ||#tq          tx          |%                              r8t$          j        4                    ||#tq          tx          |%                    }=d}8n|	st$          j                            t$          j        4                    ||#tj          dJz                       sCt$          j                            t$          j        4                    ||#tl                              r)t{          dKtq          tv          |%           dL| dM          |	slt$          j                            t$          j        4                    ||#tn                              r)t{          dKtq          tv          |%           dL| dN          |	r)t{          dKtq          tr          |%           dL| d2          t{          dKtq          tv          |%           dOtq          tr          |%           dOtl           dOtj          dJz    dPtn           dL| d2
          t$          j                            t$          j        4                    |#|                    r|}=d}<nt$          j                            t$          j        4                    |#|dJz                       r?|st          dQ|dJz    dR          t$          j        4                    |#|dJz             }=d}<nt}          |          r|}>t          |          }?nc|rtl          }>n9|rtn          }>n/|	durtq          tr          |%          }>ntq          tv          |%          }>	 |||||||1||#dd|$dS}@t          ||>fi |@}?|?|>tq          tr          |%          k    rt          |tq          tt          |%          fi |@}?|?d}8n|	r]|dTk    rt          |fi |@\  }?}}8||@dU<   |?>t{          | dVtq          tr          |%           dPtq          tt          |%           dW          n#tq          tv          |%          }>t          ||>fi |@}?|?>|>tq          tv          |%          k    r%t          |tq          tx          |%          fi |@}?|?d}8|stG                      s|?u|>tv          tx          fv rc|8rtt          ntr          }A|||||dX}B|||||1|#dd|$dY	|B}@t          ||Afi |Bs/t          t          |fdZdi|@d[\          C                                 n|||||dX}Bt          |tl          fi |Br(t{          | dVtq          tv          |%           dM          t          |tn          fi |Br(t{          | dVtq          tv          |%           dN          |%>t          |tv          fi |Br+t{          | dVtq          tv          |%           d]|% d^          t{          | dVtq          tv          |%           dOtq          tr          |%           dOtl           dOtj           dPtn           d2
          na# tz          $ r  t          $ rJ}Ct{          d_| d`| datq          tv          |%           dOtl           dOtj           dPtn           d2          |Cd}C~Cww xY w|<r t          $                    db|=            |=}?nt          $                    db|> dc|?            na|*r]dddelEmF}D t$          j                            |*          r|*}+n|||||||1||#dd|$dS}@t          ||*fi |@}+ |D|+df          dg         }d}?d}8nd}?|8rt          ||?|||||||1||#|$h          \  }?}9t                      r.t          |?t4                    r|?H                    di          rt          |?djk          5 }.|.J                                }Eddd           n# 1 swxY w Y   |EK                    dl          djk    rn|EK                    dl          dmk    rd}t          $                    dn           nu|EK                    dl          dok    rd}t          $                    dp           n?|EK                    dl          dqk    rn%t          dr|EK                    dl                     ||z   }0|0r|8s|t          |?|
s          }d}F|t          |t4                    r|dtk    rt          |d          r-|jN        &|jN        }t          $                    du| dv           n|8rdw|9v r	|9dw         }n9|8st          |          }n't          |?d3         |
s          }Gt          |G          }~Gt          $                    dx           n=t          t0          |          rt          t0          |          }nt          dy|           | P                    |          }F| jQ        duo|t0          jR        k    pt          |6dz          };|8r	|9d{         }Hn!t          |T                                          }H|+|s|;rt                      rd}||_U        t          ||          g}It;                      rL|7sJd3dlW}Jt          $                    d}           |JjX        Y                    t                      ~          g|Iz   }InI|rGt                      st=          d7t>           d8          |I[                    t                                 tO          j(        |          }t          |dd          s| ]                    ||(||          }t          |I          5   | |g|R i |3}Kddd           n# 1 swxY w Y   |Kj_        }|;r&t                      rt;                      sd}|KjQ        Xng X|6|6`                    |K|X           ||_a        t          |t4                    r{i }L|6)|Lb                    |6c                    |K|                     |Lb                    Xfd|Kd                                D                        |}M|6|6e                    |M          }M|Kf                    |          }N|d0vrt          d          d|Ni}Odt          jh        t                    jj        v r|L|Od<   n-t          |L          d3k    rt                              d           |dk    rt          |Kf|M|dk    |d|O}nt          |          }|6|6n                    |          }|
|Od<   |Ko                                 t          |Kfdw|Mi|O}|6|6.                    |           n5|3|Ko                                 t          |K          }Pt          |P|           |rx|?H                    dJ          r | r                    |K||?dd                   }Kn	 dddlsmt}Q  |Q|K|?dd          \  }K}:n# t<          $ r t          u                    d            w xY w|r=	 dddlvmw}R  |R|K|?          }Knn# t<          $ r t          u                    d            w xY w|0rC|Ft1          jx        |F           | y                    |K||H|?|||9|||||||6X|+|
          \  }K}S}T}U}V}W|Ko                                 |Kz                                 |K{                                rN|)Lt          $                    d           |Kj|        "                    |)}                                          |K_|        nd|K{                                rP|N	 t          j&        |f||||||||#||d
||K_|        n*# t          $ r t          $                    d           Y nw xY w||||V|d}Odt          jh        t                     jj        v r
|Kj        |Od<   dt          jh        t                     jj        v r"|6 |6j,        j1        t          j        k    rd|Od<   |6b|6j,        j1        t          j        k    rGt          |t
                    r1d|                                v sd|                                v rd|Od<   t                      st;                      st          |Kfi |O |6|6                    |K           |6|K_        |-|K                    |-|'||&           |r|:|S|T|U|Wd}:|K|:fS |KS )aB  
        Instantiate a pretrained pytorch model from a pre-trained model configuration.

        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train
        the model, you should first set it back in training mode with `model.train()`.

        The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come
        pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
        task.

        The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those
        weights are discarded.

        If the model weights are in the same precision as the base model (and the model type is supported), weights will be lazily loaded
        in using the `meta` device and brought into memory once an input is passed through that layer regardless of
        `low_cpu_mem_usage`.

        Parameters:
            pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
                Can be either:

                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
                    - A path to a *directory* containing model weights saved using
                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
                    - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
                      this case, `from_tf` should be set to `True` and a configuration object should be provided as
                      `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
                      PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
                    - A path or url to a model folder containing a *flax checkpoint file* in *.msgpack* format (e.g,
                      `./flax_model/` containing `flax_model.msgpack`). In this case, `from_flax` should be set to
                      `True`.
                    - `None` if you are both providing the configuration and state dictionary (resp. with keyword
                      arguments `config` and `state_dict`).
            model_args (sequence of positional arguments, *optional*):
                All remaining positional arguments will be passed to the underlying model's `__init__` method.
            config (`Union[PretrainedConfig, str, os.PathLike]`, *optional*):
                Can be either:

                    - an instance of a class derived from [`PretrainedConfig`],
                    - a string or path valid as input to [`~PretrainedConfig.from_pretrained`].

                Configuration for the model to use instead of an automatically loaded configuration. Configuration can
                be automatically loaded when:

                    - The model is a model provided by the library (loaded with the *model id* string of a pretrained
                      model).
                    - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded by supplying the
                      save directory.
                    - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
                      configuration JSON file named *config.json* is found in the directory.
            state_dict (`Dict[str, torch.Tensor]`, *optional*):
                A state dictionary to use instead of a state dictionary loaded from saved weights file.

                This option can be used if you want to create a model from a pretrained configuration but load your own
                weights. In this case though, you should check if using [`~PreTrainedModel.save_pretrained`] and
                [`~PreTrainedModel.from_pretrained`] is not a simpler option.
            cache_dir (`Union[str, os.PathLike]`, *optional*):
                Path to a directory in which a downloaded pretrained model configuration should be cached if the
                standard cache should not be used.
            from_tf (`bool`, *optional*, defaults to `False`):
                Load the model weights from a TensorFlow checkpoint save file (see docstring of
                `pretrained_model_name_or_path` argument).
            from_flax (`bool`, *optional*, defaults to `False`):
                Load the model weights from a Flax checkpoint save file (see docstring of
                `pretrained_model_name_or_path` argument).
            ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`):
                Whether or not to ignore size mismatches: if `True`, no error is raised when some of the weights from
                the checkpoint do not have the same size as the weights of the model (for instance, when instantiating
                a model with 10 labels from a checkpoint with 3 labels).
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
                cached versions if they exist.
            resume_download:
                Deprecated and ignored. All downloads are now resumed by default when possible.
                Will be removed in v5 of Transformers.
            proxies (`Dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            output_loading_info(`bool`, *optional*, defaults to `False`):
                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
            local_files_only(`bool`, *optional*, defaults to `False`):
                Whether or not to only look at local files (i.e., do not try to download the model).
            token (`str` or `bool`, *optional*):
                The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
                the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                identifier allowed by git.

                <Tip>

                To test a pull request you made on the Hub, you can pass `revision="refs/pr/<pr_number>"`.

                </Tip>

            mirror (`str`, *optional*):
                Mirror source to accelerate downloads in China. If you are from China and have an accessibility
                problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety.
                Please refer to the mirror site for more information.
            _fast_init(`bool`, *optional*, defaults to `True`):
                Whether or not to disable fast initialization.

                <Tip warning={true}>

                One should only disable *_fast_init* to ensure backwards compatibility with `transformers.__version__ <
                4.6.0` for seeded model initialization. This argument will be removed at the next major version. See
                [pull request 11471](https://github.com/huggingface/transformers/pull/11471) for more information.

                </Tip>
            attn_implementation (`str`, *optional*):
                The attention implementation to use in the model (if relevant). Can be any of `"eager"` (manual implementation of the attention), `"sdpa"` (using [`F.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html)), or `"flash_attention_2"` (using [Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention)). By default, if available, SDPA will be used for torch>=2.1.1. The default is otherwise the manual `"eager"` implementation.

            > Parameters for big model inference

            low_cpu_mem_usage(`bool`, *optional*):
                Tries not to use more than 1x model size in CPU memory (including peak memory) while loading the model.
                Generally should be combined with a `device_map` (such as `"auto"`) for best results.
                This is an experimental feature and a subject to change at any moment.
                <Tip warning={true}>
                    If the model weights are in the same precision as the model loaded in, `low_cpu_mem_usage` (without
                    `device_map`) is redundant and will not provide any benefit in regards to CPU memory usage. However,
                    this should still be enabled if you are passing in a `device_map`.
                </Tip>
            torch_dtype (`str` or `torch.dtype`, *optional*):
                Override the default `torch.dtype` and load the model under a specific `dtype`. The different options
                are:

                1. `torch.float16` or `torch.bfloat16` or `torch.float`: load in a specified
                  `dtype`, ignoring the model's `config.torch_dtype` if one exists. If not specified
                  - the model will get loaded in `torch.float` (fp32).

                2. `"auto"` - A `torch_dtype` entry in the `config.json` file of the model will be
                  attempted to be used. If this entry isn't found then next check the `dtype` of the first weight in
                  the checkpoint that's of a floating point type and use that as `dtype`. This will load the model
                  using the `dtype` it was saved in at the end of the training. It can't be used as an indicator of how
                  the model was trained, since it could have been trained in one of the half-precision dtypes but saved
                  in fp32.

                3. A string that is a valid `torch.dtype`. E.g. "float32" loads the model in `torch.float32`, "float16" loads in `torch.float16` etc.

                <Tip>

                For some models the `dtype` they were trained in is unknown - you may try to check the model's paper or
                reach out to the authors and ask them to add this information to the model's card and to insert the
                `torch_dtype` entry in `config.json` on the hub.

                </Tip>

            device_map (`str` or `Dict[str, Union[int, str, torch.device]]` or `int` or `torch.device`, *optional*):
                A map that specifies where each submodule should go. It doesn't need to be refined to each
                parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the
                same device. If we only pass the device (*e.g.*, `"cpu"`, `"cuda:1"`, `"mps"`, or a GPU ordinal rank
                like `1`) on which the model will be allocated, the device map will map the entire model to this
                device. Passing `device_map = 0` means put the whole model on GPU 0.

                To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For
                more information about each option see [designing a device
                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
            max_memory (`Dict`, *optional*):
                A dictionary device identifier to maximum memory. Will default to the maximum memory available for each
                GPU and the available CPU RAM if unset.
            offload_folder (`str` or `os.PathLike`, *optional*):
                If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
            offload_state_dict (`bool`, *optional*):
                If `True`, will temporarily offload the CPU state dict to the hard drive to avoid getting out of CPU
                RAM if the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to
                `True` when there is some disk offload.
            offload_buffers (`bool`, *optional*):
                Whether or not to offload the buffers with the model parameters.
            quantization_config (`Union[QuantizationConfigMixin,Dict]`, *optional*):
                A dictionary of configuration parameters or a QuantizationConfigMixin object for quantization (e.g
                bitsandbytes, gptq). There may be other quantization-related kwargs, including `load_in_4bit` and
                `load_in_8bit`, which are parsed by QuantizationConfigParser. Supported only for bitsandbytes
                quantizations and not preferred. Consider inserting all such arguments into quantization_config
                instead.
            subfolder (`str`, *optional*, defaults to `""`):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can
                specify the folder name here.
            variant (`str`, *optional*):
                If specified load weights from `variant` filename, *e.g.* pytorch_model.<variant>.bin. `variant` is
                ignored when using `from_tf` or `from_flax`.
            use_safetensors (`bool`, *optional*, defaults to `None`):
                Whether or not to use `safetensors` checkpoints. Defaults to `None`. If not specified and `safetensors`
                is not installed, it will be set to `False`.

            weights_only (`bool`, *optional*, defaults to `True`):
                Indicates whether unpickler should be restricted to loading only tensors, primitive types,
                dictionaries and any types added via torch.serialization.add_safe_globals().
                When set to False, we can load wrapper tensor subclass weights.

            kwargs (remaining dictionary of keyword arguments, *optional*):
                Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
                `output_attentions=True`). Behaves differently depending on whether a `config` is provided or
                automatically loaded:

                    - If a configuration is provided with `config`, `**kwargs` will be directly passed to the
                      underlying model's `__init__` method (we assume all relevant updates to the configuration have
                      already been done)
                    - If a configuration is not provided, `kwargs` will be first passed to the configuration class
                      initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that
                      corresponds to a configuration attribute will be used to override said attribute with the
                      supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute
                      will be passed to the underlying model's `__init__` function.

        <Tip>

        Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to
        use this method in a firewalled environment.

        </Tip>

        Examples:

        ```python
        >>> from transformers import BertConfig, BertModel

        >>> # Download model and configuration from huggingface.co and cache.
        >>> model = BertModel.from_pretrained("google-bert/bert-base-uncased")
        >>> # Model was saved using *save_pretrained('./test/saved_model/')* (for example purposes, not runnable).
        >>> model = BertModel.from_pretrained("./test/saved_model/")
        >>> # Update configuration during loading.
        >>> model = BertModel.from_pretrained("google-bert/bert-base-uncased", output_attentions=True)
        >>> assert model.config.output_attentions == True
        >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower, for example purposes, not runnable).
        >>> config = BertConfig.from_json_file("./tf_model/my_tf_model_config.json")
        >>> model = BertModel.from_pretrained("./tf_model/my_tf_checkpoint.ckpt.index", from_tf=True, config=config)
        >>> # Loading from a Flax checkpoint file instead of a PyTorch model (slower)
        >>> model = BertModel.from_pretrained("google-bert/bert-base-uncased", from_flax=True)
        ```

        * `low_cpu_mem_usage` algorithm:

        This is an experimental function that loads the model using ~1x model size CPU memory

        Here is how it works:

        1. save which state_dict keys we have
        2. drop state_dict before the model is created, since the latter takes 1x model size CPU memory
        3. after the model has been instantiated switch to the meta device all params/buffers that
        are going to be replaced from the loaded state_dict
        4. load state_dict 2nd time
        5. replace the params/buffers from the state_dict

        Currently, it can't handle deepspeed ZeRO stage 3 and ignores loading errors

        r   Nfrom_tfF	from_flaxresume_downloadproxiesoutput_loading_infor  trust_remote_codemirror_from_pipeline
_from_auto
_fast_initTr  low_cpu_mem_usager  
max_memoryr  offload_state_dictoffload_buffersload_in_8bitload_in_4bitr  	subfolderr   _commit_hashr	  adapter_kwargsadapter_namedefaultr  r  	gguf_filer  r  r  zgThe argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.zIaccelerate is required when loading a GGUF file `pip install accelerate`.)r  r  r"  r#  r  r  r  r0   _raise_exceptions_for_gated_repo%_raise_exceptions_for_missing_entries'_raise_exceptions_for_connection_errors_adapter_model_path)r  r  r"  r#  r  r1  r!  r"  r#  base_model_name_or_path)autobalancedbalanced_low_0
sequentialzWhen passing device_map as a string, the value needs to be a device name (e.g. cpu, cuda:0) or 'auto', 'balanced', 'balanced_low_0', 'sequential' but found r   r   znYou can't pass device_map as a negative int. If you want to put the model on the cpu, pass device_map = 'cpu' z>Passing along a `device_map` requires `low_cpu_mem_usage=True`z`DeepSpeed Zero-3 is not compatible with `low_cpu_mem_usage=True` or with passing a `device_map`.z`Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install 'accelerate>=z'`zwYou can't pass `load_in_4bit`or `load_in_8bit` as a kwarg when passing `quantization_config` argument at the same time.c                 Z    i | ](\  }}|t          j        t                    j        v %||)S rz   )r  r  rW   r   r   s      r|   r  z3PreTrainedModel.from_pretrained.<locals>.<dictcomp>  s7    tttDAqa7CTUgChChCs>s>s1a>s>s>sr{   )r/  r.  )config_dictreturn_unused_kwargszThe `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.rB  pytorch)	file_typer[  from_auto_classusing_pipelinez+Offline mode: forcing local_files_only=True)r  rA  r  r"  r#  r  r  r  r0  r(  r'  r  )pre_quantized)r  r   r!  r  quantzK`low_cpu_mem_usage` was None, now default to True since model is quantized.zYou cannot combine Quantization and loading a model from a GGUF file, try again by making sure you did not passed a `quantization_config` or that you did not load a quantized model from the Hub.z.indexzError no file named z found in directory zf but there is a file for TensorFlow weights. Use `from_tf=True` to load this model from those weights.zb but there is a file for Flax weights. 
Use `from_flax=True` to load this model from those weights.z, r   z$We found a TensorFlow checkpoint at z:, please set from_tf to True to load from this checkpoint.)r  r  r#  r"  r  r  
user_agentr  r0  r6  r7  r1  r  r  z& does not appear to have a file named z and thus cannot be loaded with `safetensors`. Please make sure that the model has been saved with `safe_serialization=True` or do not set `use_safetensors=True`.)r  r#  r  r  r  )	r  r  r"  r  rH  r0  r6  r7  r1  ignore_errors_during_conversionzThread-autoconversion)targetr   r   r   z) but there is a file without the variant z;. Use `variant=None` to load this model from those weights.zCan't load the model for 'z'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'z=' is the correct path to a directory containing a file named zloading weights file z from cache at r   )load_gguf_checkpoint)return_tensorsr  )
r  r  r#  r"  r  r  rH  r  r0  r1  r  rY  rZ  r\  r]  zAA TensorFlow safetensors file is being loaded in a PyTorch model.r^  z;A Flax safetensors file is being loaded in a PyTorch model.r_  zTIncompatible safetensors file. File metadata is not ['pt', 'tf', 'flax', 'mlx'] but r0  r;  zWill use torch_dtype=z$ as defined in model's config objectr   zSince the `torch_dtype` attribute can't be found in model's config object, will use torch_dtype={torch_dtype} as derived from model's weightszh`torch_dtype` can be one of: `torch.dtype`, `"auto"` or a string of a valid `torch.dtype`, but received use_keep_in_fp32_modulesall_checkpoint_keys)r   r  r  r  )r  r  r  )rB  r  r  c                 h    i | ]-\  }t          fd D                        t          j        .S )c              3       K   | ]}|v V  	d S r   rz   )r   r  r   s     r|   r  z=PreTrainedModel.from_pretrained.<locals>.<dictcomp>.<genexpr>"  s'      CC19CCCCCCr{   )r  rs   r   )r   r  r   r  s     @r|   r  z3PreTrainedModel.from_pretrained.<locals>.<dictcomp>  sV       aCCCC.BCCCCC%-  r{   ziIf passing a string for `device_map`, please choose 'auto', 'balanced', 'balanced_low_0' or 'sequential'.no_split_module_classesspecial_dtypeszThis model has some weights that should be kept in higher precision, you need to upgrade `accelerate` to properly deal with them (`pip install --upgrade accelerate`).r>  r=  )r   low_zeror+  )r  i)$load_tf2_checkpoint_in_pytorch_model)allow_missing_keysr$  zLoading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.)%load_flax_checkpoint_in_pytorch_modelzLoading a Flax model in PyTorch, requires both PyTorch and Flax to be installed. 
Please see https://pytorch.org/ and https://flax.readthedocs.io/en/latest/installation.html for installation instructions.)r  sharded_metadatar)  r*  r  r  r,  r   r  r  	gguf_pathr0  z\The user-defined `generation_config` will be used to override the default generation config.)
r  r  r"  r#  r  r  r  r0  r(  r'  zZGeneration config file not found, using a generation config created from the model config.)r  offload_dirr  r-  	skip_keysforce_hooksr2  r  )r3  r  r2  )rN  rO  mismatched_keysr  rz   )r  r}   r8  r9  r:  r   rJ   r   r  rC   r  r   r>   r2   rA   r   rH   r   rw   r4  r6  r7  r8  r9  rs   r   r   r;  r   r!   r  r/   r   rW   	from_dictrF   r  config_classfrom_pretrainedr  r  r  r+   merge_quantization_configsr  from_configvalidate_environmentupdate_torch_dtypeupdate_device_mapr  r  isdirr5  r8   r7   r4   r  r6   r5   r:   r9   EnvironmentErrorrI   r@   r.   rB   r   r  rg  modeling_gguf_pytorch_utilsrK  rR   rc  rl   r
  ry   r=  r  r  r   r  r  r  r   r   r  r   r  r  r  r    r  r^   r  r;   rK  preprocess_model_pre_quantization_dtypeupdateget_special_dtypes_updater  adjust_target_dtyper  r  r  r]   r   r   rc   rd   adjust_max_memoryr  rb   r`   load_tf_weightsmodeling_tf_pytorch_utilsrU  r  modeling_flax_pytorch_utilsrW  r  _load_pretrained_modelevalr  r  to_dictr   rd  r\   _skip_keys_device_placementrX   r  
FBGEMM_FP8r  r   postprocess_modelr  load_adapter)Yr  r  rK  r  r  r  r  r  r  r  r0  
model_argsr   r   r   r!  r"  r#  r$  r  r%  r  from_pipelinerD  r)  r  r*  r  r+  r  r,  r-  r.  r/  r  r0  commit_hashr	  r2  r3  r  r  r5  rY  resolved_config_filer9  rL  r@  from_ptrH  config_pathmodel_kwargskwarg_attn_imprF  r  rW  r  rX  loading_inforN  is_localarchive_filer  resolved_archive_filecached_file_kwargssafe_weights_namehas_file_kwargsrk  rK  r
  r  one_state_dictr  init_contextsr  rB  rS  target_dtypeno_split_modulesdevice_map_kwargsr  rU  rW  rN  rO  r]  r  r  r  sY                                                                                           @r|   r`  zPreTrainedModel.from_pretrainedj  s"#   L ZZd33
**Y..JJ{E22	 **%6==**Y--$jj)>FF$4d;;"JJ':DAAJJx&&

#3T:: **\599ZZd33
jj55"JJ':DAAZZd33
ZZd33
$4d;;#ZZ(<eDD **%6>>zz.%88zz.%88$jj)>EEJJ{B//	jj66**Y--$4b99zz.)<< &

+BE J J"JJ':DAAJJ{D11		 	% $%M E     l   #E!;~@]@]&+N7#"+C+E+E"#O$$NN  
  )@)B)B hiiif&677 D'21'#1$3#%5%'5::?<A( ( ($ 22FTT%fndCC 	'"0"4"45JD"Q"Q"*&>1	''#1$3#%5!,	' 	' %	' 	'# #.27>>BU3V3V.-sWEEE \*G'48IaLLAZ4[1\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ #' j%,// 	.j)JJ
C(( 	.Z?s-s-s %,z":":;

    bT^b b b  
 
C(( 	.A~~  E   !*-
! ($(!!& c !abbb 	)++  v   -.. ! R  xN  R  R  R  
  	< 	". G   utFLLNNtttKe[e,XdeeeK*<*F +'d+ +FL+ +' NNh  
 *+#*Wfgg
$+8J'( 	$%5 	$KKEFFF# &"233 	"$*$6&&<YK#C3#3#C$#%)- /!1!#*,$ $ $ $ FLL. ]6**F#ZZ(=tDDN).<+!L(=tDDDP 		 /; A-<-W.0C. .** .A**6v7QanoooLLL#--'IZd .    '99+FFK%77
CCJ #/"B"O"UJw !($(!lmmm#4/ 
  $#(  \%= U   )49J,/0M,N,N)w}}%BCCH p Krw~~GLL!>	?]eKeff    K $&7<<0MyZiltZt#u#uLL FGLL!>	K[\\" " F $&7<<0MyZj#k#kLL A27>>GLL!>	K\]]$ $ A $&7<<0MyZk#l#lLL$E11bgnnGLL!>	<XikrKsKstt7 71 $&7<<5y,O`biBjBj$ $LL %E11bgnnGLL5y,OfhoBpBp 7 71 $&7<<5y,OfhoBpBp$ $L "&JJ( +RW^^GLL!>	<XdfmKnKnoo. . + $&7<<5y,|]dBeBe$ $LL ) $RW^^GLL!>	<XjlsKtKtuu. . $ $&7<<5y,OacjBkBk$ $L "&JJ( GNN27<<0MyZiltZt#u#uvvw~~bgll3PR[]m&n&noo +Q|L'/R/R Q Q9Q Q Q  
 ) RW^^GLL!>	K\]]. .  +B|L'/R/R B B9B B B  
 % 
*=|<Mw/W/W = =9= = =  
 +=|L'/R/R = =Vbctv}V~V~ = =,= =0?(0J= =Pa= =9= = =  
 Y8U V VWW c<Y8UX`8` a abb ` $H?\_g?g H H H    "w||I7TW_7_``<== X8(45R(S(S%%  C/HH C0HH$E11+,=wGGHH+L'BBHJ &/*8#*+:,<!&&0$,%.<AAF(3* *& -88UW_,v,vcu,v,v) -4\RcelEmEm9m9m0;9()@'JJ1 1 11 1-
 1<)-JJ, '611N]$AO" O"EWO" O" K 5x >F.z:4<&6'D %v %v(45F(P(P%v %vVbcz  }D  WE  WE%v %v %v'" '" !"  = (4L''J'JH4? =x5 5K]5 51 -4\R^`gEhEh9h9h0;9();WEE1 1 11 1-
 1<)-J+ C"O4E4E C"0<'L:L+MMM PZ4p4K4K_p 108/6-21:8H3" 3" 2;6D7F8H2<1:HMMR4?6" 6" '66" 2 (00MO`'t'tds't't !.$*/>.K-M0QSW/n[m/n-D	%& %& %&
 ',eggg
 -5+2).-64D/ /O  ((EGWkk[jkk "&6'D %a %a(4\7(K(K%a %a %a'" '" !"
 "**GIZ!n!n^m!n!n "&6'D %_ %_(4\7(K(K%_ %_ %_'" '" !"
 ")!4 =|: :O^: :!4 '7'D %m %m(4\7(K(K%m %m(/%m %m %m'" '" !" '7'D %f %f(4\7(K(K%f %fO[\movOwOw%f %f(8%f %f<K%f %fQb%f %f %f'" '" !" (        *Z5R Z Z=ZZ Z ?K<Y`>a>aZ Z -	Z Z 1@	Z Z FW	Z Z Z    fBLBBCCC(4%%dHddMbddeeee 	)IIIIII w~~i(( h%		
 "+&4&'6(8"", (!*8==B$/& &" ((EyggTfgg	--iMMMiXJ$(!JJ$(!  	6P-%#- /!1%!#(7 7 73!#3  %&&	0#66	 &..~>>	
 0DAAA (Q::<<( ( ( ( ( ( ( ( ( ( ( ( ( ( ( ||H%%--h''4//_````h''611 	YZZZZh''500  Dkskwkw  yA  lB  lB  D  D   *+  5	" _*"4,-BQ]^^^
 J&k3// "f,,"6=99 f>P>\*0*<K"KK(q(q(q(qrrrr) 3g9I.I.I.>w.G%/ 3.B:.N.N1@AVWXAYht1u1u1u.B>.R.R$2"KK!e    !44 &-e[&A&A( U  HS  U  U   !99+FF
 ),(A(M (-c',Hb2c2c %  A)9:O)P&&)-joo.?.?)@)@& &7 <T YpYrYr  "
; )<<<=%'' 
	7 
	7KKZ[[[&^00EUEWEW0XXY\iiMM 	7*,, ! R  xN  R  R  R     !3!5!5666v&&v=uEE 	55.CQ\is 6  F ]++ 	= 	=C<<<<|<<E	= 	= 	= 	= 	= 	= 	= 	= 	= 	= 	= 	= 	= 	= 	=
  $ 	&&(( )1K1M1M )$(!#(#>  #% #))
I] *    .9F*j#&& ;	JN'%%l&L&LUT_&`&`aaa!!   #(#9#9#;#;     'L'+??MM$:::FF!UUU $  
 ";<L M7#45J#K#K#VVV6D!"233^$$q((d   \))0&(,<<)	 
 ( 

 ,J77
');;JGG
.8l+ .u^^L^L]^^J'11Z1HHH#.u55K0jIII =	$--h77 ++E6;PQTRTQT;UVV______*N*N4cg+ + +'E<< #   LL)  
   *	
^^^^^^==eEZ[[   2  
   	%'
333 **&%-(?!1%"3%-#5!)%9#)# +  , 	 	

  	$5$AKKvwww&+&=&G&GHYHaHaHcHc&d&dE##!! 	&C&O*:*J1+'#1$3#%5%'.#0+ + + +''    p   	 !(-!.#2	! ! g/??JJJ161R!+. !2>!B!B!MMM , 4AEWE[[[37!-0( 4AEWEbbbz400 cj//1111Vz?P?P?R?R5R5R7;!"34"$$ ;-G-I-I ;u::(9:::#**5111!-E*#)-	      	'#$0'6'6",	    ,&&s   :P##P'*P'*R RJ=AA? A?ACBAACCACG0AHHAHHAHVAVVAV!V$AV!_8A` `&A`7`=Aa a&Aa6e"Af f$Af)f(Af)c                 L   =>?@ABCDEFGHIJK d}|d u}d }d }|
d|
                                 v rjt          |t          t          f          r|d         n|??                    d          }||st          d          |t          j        |d           |d}|o|d u}|                                 |	                                }t          |
                                          }|j        I||                    |||          }d =|}=fd	|D             }t          I          dk    r7t          Ifd
|D                       }t          Ifd|D                       }nd}d}| o|}|o| }|r"I d>>fd|D             A>fd|D             }n|rIfd|D             }t          t!          |          t!          |          z
            }t!          |          t!          |          z
  }d |                                D             } |r>fd| D             } n|rIfd| D             } t          || z
            }|                                 |
t%                      st'                      st)          j        t                    }!|	                                                                D ]/\  F}"t/          |"          }#|!|#                             F           0d |!                                D             }$nt3          |          }$|$D ]rC|r>fdCD             Cn|rIfdCD             CCfd|D             Et          E          dk    r.t          E          t          C          k     rEfd|D             }s j         j        D ]HHfd|D             } j         j        D ]HHfd|D             }||                    ||I          }|	r|D ]̊DDt          |
                                          v rDDnƉI dD t          |
                                          v rI dD DnD                    I          rd                    D                    d          dd                    t          |
                                          v r0d                    D                    d          dd                    D|D         }%|}&|7|t@          j!        k    r't          Dfd|D                       rt@          j"        }&|%j#        tA          j#        d          k    r}tA          j$        |%%                                d|&i}'|r*tM          |dd          s|'                    ||'Di           stQ          |Dd |'           |)                    ||'Dd ||           |r|s|rIfd!|D             }(n|rIfd"|D             }(n|}(tU          ||(          })tW          |j,        d#          r@|j,        j-        r4|.                                }*|*tW          |*d$          r|*j/        d|*_0        n!tc          |2                                          })t'                      r|sdd l3}+t          t!          th          j5        6                    d% |)                                 D                                           },|+j7        8                    |,d&          5  |9                    |j:                   d d d            n# 1 swxY w Y   n|9                    |j:                   |^|;                                D ]I\  F}%t          Ffd'|D                       r)|%j<        =                    t@          j"                  |%_<        Jd(J|}-t           j                  dk    r!tW          | j                  s|r
 j        dz   Jt           j                  dk    rtW          | j                  r|stM          | j                  }-t          |-	                                
                                          @t          @Afd)|D                       rt          d*          |
  fd+|
                                D             }
Ifd,}.|Ut          j>        j?                            |d                             t          j>        j?                  d d-                   Bnd B|
|rt          |
|J          G|#t          |          B                    d.d(          nd/K|5t          |t          t          f          r|d         n|??fd0|D             }/n&Bfd1|d2                                         D             }/GJKfd3|/                                D             }0nd }0|Z |.||||||          }1|r#t          |-|J||
||0|||||||4          \  }2}0}nAt          |-|J          }3t          |-|J|3          }2nt          |t                    s|g}g }2g }1|s|
d|
                                 v ri nd }0|rt          jG                    }i }nd }d }|r!t          |
|J5          }4Bfd6|4D             }4ng }4t          |          dk    rt          jJ        |d78          }d }3|D ]}5|5|4v rd }6|
b|`|jK        jL        t          jN        k    rF|jK        jO        d9k    r6tA          j#        d: |
                                 D             d                   }6t          |5||6|;          }|1 |.||||||          z  }1|	rt%                      rt                      s|s}|-	                                                                D ]U\  D}%|%j#        tA          j#        d          k    r3tQ          |-Dd tA          j$        |%%                                d|i           VnOt          |-|J||
||0|||||||4          \  }7}0}|2|7z  }2n(|3t          |-|J          }3|2t          |-|J|3          z  }2~t          jS                     |0t          |0          dk    r||-k    r j        I|s_|0D ]\}8t          jU        t          j>                            ||8 d<          t          j>                            |I d|8 d<                     ]Ifd=|0                                D             }0|st          |0|           d }0|r%t          |-||           t          jX        |           t          |2          dk    r=d>                    |2          }9d?|9v r|9d@z  }9t          dA|jZ        j[         dB|9           t          |          dk    rz|j,        j\        g n|j,        j\        }:|jZ        j[        |:v rt          j^        nt          j_        }; |;dC| dD|jZ        j[         dE| dF|jZ        j[         dG|jZ        j[         dH           n(t          _                    dI|jZ        j[         dJ           t          |          dk    r/t          ^                    dK|jZ        j[         dL| dM| dN           nKt          |1          dk    r8t          _                    dO|jZ        j[         dP| dQ|jZ        j[         dR           t          |1          dk    rMdS                    dT |1D                       }<t          ^                    dK|jZ        j[         dL| dU|< dN           ||||1|0|2fS )VNFr  r   r  zThe current `device_map` had weights offloaded to the disk. Please provide an `offload_folder` for them. 
Alternatively, make sure you have `safetensors` installed if the model you are using offers the weights in this format.Tr  c                 ~   d| v r|                      dd          S d| v r|                      dd          S t          t          j        j        d          r5d| v r|                      dd          S d| v r|                      dd	          S n4d| v r|                      dd          S d	| v r|                      d	d          S | S )
Nr  r  r  r  r  r  r  r  r  )r  r  r   r  r  )r   s    r|   _fix_keyz8PreTrainedModel._load_pretrained_model.<locals>._fix_key  s    }}{{66222#~~{{7H555 rx0-@@ 	X$$;;z3VWWW$$;;z3VWWW % 7#==;;'JJWWW6#==;;'JJWWWJr{   c                 &    g | ]} |          S rz   rz   )r   r   r  s     r|   r   z:PreTrainedModel._load_pretrained_model.<locals>.<listcomp>3  s!    <<<xx}}<<<r{   c              3   B   K   | ]}|                               V  d S r   r   r   sr  s     r|   r  z9PreTrainedModel._load_pretrained_model.<locals>.<genexpr>6  s/      #N#NQALL$8$8#N#N#N#N#N#Nr{   c              3   B   K   | ]}|                               V  d S r   r   r  s     r|   r  z9PreTrainedModel._load_pretrained_model.<locals>.<genexpr>7  s/      'T'TV(<(<'T'T'T'T'T'Tr{   r   c                 >    g | ]}|                               |S rz   r   r   r  _prefixs     r|   r   z:PreTrainedModel._load_pretrained_model.<locals>.<listcomp>C  s+    )`)`)`!,,W^J_J_)`!)`)`)`r{   c                 l    g | ]0}|                               r|t                    d          n|1S r   r   r   r  s     r|   r   z:PreTrainedModel._load_pretrained_model.<locals>.<listcomp>D  s?    fffST!,,w2G2GNQs7||~~.
.Qfffr{   c                 >    g | ]}d                      |g          S r  r5  r  s     r|   r   z:PreTrainedModel._load_pretrained_model.<locals>.<listcomp>F  s)    JJJqSXXvqk22JJJr{   c                     h | ]\  }}|S rz   rz   )r   r=  r  s      r|   rp  z9PreTrainedModel._load_pretrained_model.<locals>.<setcomp>M  s    ===tq!===r{   c                 l    h | ]0}|                               r|t                    d          n|1S r   r  r   r   r  s     r|   rp  z9PreTrainedModel._load_pretrained_model.<locals>.<setcomp>O  s@    nnnY\CNN74K4KTSW00QTnnnr{   c                 >    h | ]}d                      |g          S r  r  r  s     r|   rp  z9PreTrainedModel._load_pretrained_model.<locals>.<setcomp>Q  s)    NNNSXXvsm44NNNr{   c                 >    g | ]\  }}t          |          d k    |S r  r  )r   r  r  s      r|   r   z:PreTrainedModel._load_pretrained_model.<locals>.<listcomp>\  s'    PPPXQUa5r{   c                 l    g | ]0}|                               r|t                    d          n|1S r   r  r  s     r|   r   z:PreTrainedModel._load_pretrained_model.<locals>.<listcomp>c  s?    bbbUXw0G0GPS\\^^,,Sbbbr{   c                 >    g | ]}d                      |g          S r  r  r  s     r|   r   z:PreTrainedModel._load_pretrained_model.<locals>.<listcomp>e  s)    BBBS63-00BBBr{   c                     g | ]}|v |	S rz   rz   )r   r   groups     r|   r   z:PreTrainedModel._load_pretrained_model.<locals>.<listcomp>f  s    FFFa1:::::r{   c                     g | ]}|v|	S rz   rz   )r   r   missing_in_groups     r|   r   z:PreTrainedModel._load_pretrained_model.<locals>.<listcomp>h  s$    UU
Ua1DT;T;T;T;T;Tr{   c                 >    g | ]}t          j        |          |S r   r  r   r   r  s     r|   r   z:PreTrainedModel._load_pretrained_model.<locals>.<listcomp>n  s*    UUUa29S!;L;L;T;T;T;Tr{   c                 >    g | ]}t          j        |          |S r   r  r  s     r|   r   z:PreTrainedModel._load_pretrained_model.<locals>.<listcomp>r  s+    "["["[3PQARARAZ1AZAZAZr{   r   c              3   F   K   | ]}|                     d           v V  dS r  r  )r   r  r   s     r|   r  z9PreTrainedModel._load_pretrained_model.<locals>.<genexpr>  sD        E[.#))C..@     r{   r  r   r  )r  r  r   r2  c                     g | ]	} d | 
S r  rz   r  s     r|   r   z:PreTrainedModel._load_pretrained_model.<locals>.<listcomp>  s#    #I#I#IvOOOO#I#I#Ir{   c                 D    g | ]}|t                    d z   d         S )r   Nr  r  s     r|   r   z:PreTrainedModel._load_pretrained_model.<locals>.<listcomp>  s.    #N#N#NQAc&kkAo&7&7$8#N#N#Nr{   r  r  c              3   B   K   | ]}|                     d           V  dS )F)r  N)r   )r   r  s     r|   r  z9PreTrainedModel._load_pretrained_model.<locals>.<genexpr>  sD       6 6DMI000??6 6 6 6 6 6r{   r  c              3   F   K   | ]}|                     d           v V  dS r  r  )r   r  r   s     r|   r  z9PreTrainedModel._load_pretrained_model.<locals>.<genexpr>  s4      ttE[-C@ttttttr{   r   c              3   (   K   | ]}|v o|vV  d S r   rz   )r   r   base_model_expected_keysexpected_keys_not_prefixeds     r|   r  z9PreTrainedModel._load_pretrained_model.<locals>.<genexpr>  s5      ttad344\D\9\ttttttr{   zjThe state dictionary of the model you are trying to load is corrupted. Are you sure it was properly saved?c                 T    i | ]$\  }}|                     j         d d          |%S rn  )r  r  )r   r   r   r  s      r|   r  z:PreTrainedModel._load_pretrained_model.<locals>.<dictcomp>  s<    kkkPTPQSTaii3+@(C(C(CRHH!kkkr{   c                    g }|r|D ]}|| vr|}|r	 d| }n2|r0d                     |                    d          dd                    }||v r| |         j        ||         j        k    r| |         j        d         dk    r:| |                                         dz  ||                                         k    r|                    || |         j        ||         j        f           | |= |S )Nr   r   r   r  )r5  r  r=  r  r  )
r   model_state_dictr&  add_prefix_to_modelremove_prefix_from_modelr  r]  checkpoint_key	model_keyr  s
            r|   _find_mismatched_keyszEPreTrainedModel._load_pretrained_model.<locals>._find_mismatched_keys  sF    !O& ;&1 ; ;N%Z77  .I/ L'-$@$@$@$@		, L$'HH^-A-A#-F-Fqrr-J$K$K	 "%555&~6<@PQZ@[@aaa '~6<R@AEE *> : @ @ B BQ FJZ[dJeJkJkJmJm m m !+22!/N1K1QScdmSnSt u   !+> :""r{   r   ztorch.r   c                     i | ]}|S rz   rz   )r   pr  s     r|   r  z:PreTrainedModel._load_pretrained_model.<locals>.<dictcomp>	  s    LLL!aLLLr{   c                 X    i | ]&\  }}|t           j                            |          'S rz   rw   r4  r5  )r   r  rL  rC  s      r|   r  z:PreTrainedModel._load_pretrained_model.<locals>.<dictcomp>  s/    lllTQafa!8!8lllr{   r  c                     i | ]W\  }}|                               r=|t                    d                   dk    ;|t                    d          ||dXS )Nr  )safetensors_fileweight_namer   r  )r   r  rL  param_device_mapr   	str_dtypes      r|   r  z:PreTrainedModel._load_pretrained_model.<locals>.<dictcomp>  s       Aq<<-- 3C1SEVEVEXEXCY2Z^d2d2d #l##%%&Qq[d(e(e2d2d2dr{   )
r  r  r  r  r  r   r  r  r  rO  )rX  r   c                 P    g | ]"}t           j                            |          #S rz   r  )r   rL  rC  s     r|   r   z:PreTrainedModel._load_pretrained_model.<locals>.<listcomp>N  s)    (`(`(`Qfa)@)@(`(`(`r{   zLoading checkpoint shardsr  int4_weight_onlyc                     g | ]}|d v|	S ))r2  r  rz   )r   ds     r|   r   z:PreTrainedModel._load_pretrained_model.<locals>.<listcomp>`  s$    0l0l0lqST\kSkSkSkSkSkr{   )rW  r1  r0  z.datc                 &    i | ]\  }} d | |S r  rz   )r   r   r  r  s      r|   r  z:PreTrainedModel._load_pretrained_model.<locals>.<dictcomp>  s-    $f$f$f*#u%6%6%6%6$f$f$fr{   z
	zsize mismatchz_
	You may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method.r)  z:
	z(Some weights of the model checkpoint at z! were not used when initializing z: z,
- This IS expected if you are initializing z from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing z from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).z9All model checkpoint weights were used when initializing z.
zSome weights of z3 were not initialized from the model checkpoint at z and are newly initialized: zo
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.zAll the weights of z/ were initialized from the model checkpoint at zf.
If your task is similar to the task the model of the checkpoint was trained on, you can already use z* for predictions without further training.r  c           	      .    g | ]\  }}}d | d| d| dS )z- z: found shape z in the checkpoint and z in the model instantiatedrz   )r   r   shape1shape2s       r|   r   z:PreTrainedModel._load_pretrained_model.<locals>.<listcomp>  sJ       +VV nmmFmm6mmm  r{   z= and are newly initialized because the shapes did not match:
)`r   r  r   r   rc  r   rw   r  r  r   r   r  update_expected_keysr   r  r  r:  named_buffersr}   r!   r  r  r   r&   r  rb   _keys_to_ignore_on_load_missing"_keys_to_ignore_on_load_unexpectedupdate_missing_keysr   r5  r  rs   r  r   r   r  r6  r   r  rh   r  rv  r  rK  r  r  r  rs  r  rq  r  	itertoolschainfrom_iterabler  r  rp  r  r  r  r  r4  r  expand_device_mapr   r  r  r   r  tempfilemkdtempget_disk_only_shard_filesrM   r  r  r  rX   TORCHAO
quant_typer=  r   r>  r?  shutilmoverg   re   rmtreer;  r   r   r  r   r  r  )Lr  rB  r   r&  r  r  r  rX  r)  r*  r  r  r,  r   r  r  rY  r0  r  rW  r  r  is_sharded_safetensorsr  r  original_loaded_keyshas_prefix_moduleexpects_prefix_moduler  r  rN  rO  model_buffersr  rw  	id_tensorr  r  r  r  _loaded_keysru  r  r  not_initialized_parametersr   r  r  r  r]  r  r  disk_only_shard_filesr  r1  new_error_msgsr  	error_msgarchswarnermismatched_warningr  r  r  r  r  rC  r  r   r  r   r  r  r  r   r  sL   `                                                            @@@@@@@@@@@@@@@r|   rr  z&PreTrainedModel._load_pretrained_model  s   * #4/ !f
0A0A0C0C&C&C,67LtUZm,\,\w%a((bw  *22>BBN%n% :  
 )NT::::!)%)"!/!P4DD4P 	 !++---224455(#(==e]T_``M	 	 	&  +<<<<<<<v;;?? ##N#N#N#N+#N#N#N N N$''T'T'T'Tm'T'T'T$T$T!! %$)! (9#8#R=R /M8M4M# 	KlllG)`)`)`)`])`)`)`&ffffXefffMM  	KJJJJMJJJMc-003{3C3CCDDk**S-?-?? >=u':':'<'<===# 	Onnnn`mnnnMM  	ONNNNNNNM =!@AAo&7&7@Z@\@\*400D % 0 0 2 2 8 8 : : - -f-f55	Y&&t,,,, QPPPPKK /u55K  	V 	VE' Cbbbb\abbb$ CBBBBEBBBFFFF<FFF#$$q((S1A-B-BSZZ-O-OUUUU<UUU .:: V VUUUU<UUU1== \ \"["["["[o"["["[#';;E<QWXXL   	s# s s$/44667777CC&&&&$/?/D/D/F/F*G*GGG#++c++CC^^F++ 739K0L0LPTUeUjUjUlUlPmPm0m0m((399S>>!""#566C(-  %(4..    _s     /
 $)=L<5<#7#777!KJ\JJE(	s#L2TV[\\	s  ,AA!uQS  B    	s 4E3uMMMM$;;E5#uV`bqrrr   	7* I+ /#I#I#I#I[#I#I#ILL( /#N#N#N#N+#N#N#NLL#.L-G|-\-\*5<)>?? HELDd H(-(C(C(E(E%(4&'8&AA HEVE[EcCG-@-1%2E2E2G2G-H-H*)++ 7L 7    -1!55 6 6QkQrQrQtQt6 6 6   . .* ^667Qab6cc ; ;KK 9:::; ; ; ; ; ; ; ; ; ; ; ; ; ; ; E5666  +$5577 > >etttt_sttttt >!&u}!=!=EJ s$%%))'%AV2W2W)\m)036Ls$%%))geS=R.S.S)\m)#E3+@AAM'+M,D,D,F,F,K,K,M,M'N'N$ttttthsttttt  &   %kkkkXbXhXhXjXjkkk
&	# &	# &	# &	# &	#P !,W[%%&;A&>&D&DRW[&Q&QRUSURU&VWWFFF!n!0=QS_``<A<ME

**8R888S\I' ""7$GG/)!,,. 
 MLLL7KLLL

llllEUVbEcEiEiEkEklll
     &,,..  MM !M!33 $#(' O  >^! !)#1"/&7%5!-#1)=$3? ? ?;
M+;+;$ ,Q!:|, ,( 9!:|=U 

 3T:: @)>(?%J O! i&0&<:K\K\K^K^A^A^dh! ($,$4$6$6!#%  $(!#' % +(A1AP\) ) )% )a(`(`(`J_(`(`(`%%(*%())A--(/5JQl(m(m(m%'+$3 ? ?
!666#*$0$8EI[Iccc$8CGYYY#(<0l0lJ<M<M<O<O0l0l0lmn0o#p#pL,\co  
  #8#8$(',+$ $  % !&(( 51E1G1G 5P\ 5*7*B*B*D*D*J*J*L*L " "JC$|u|F/C/CCC ;$13u{EJJLL?f`e?f?f!" !" !"" Kk)&()'1+9*7.?-="')5+91E,;K K KG7G  #n4

 073X):|4 40 "=%z<AY# # J
 
(S-?-?!-C-CM)) 2F) +8  K"K "^=Q=Q=Q R R "^=Z=Z=Z=Z=Z [ [    %g$f$f$fP]PcPcPePe$f$f$fM% )&}nEEE$(M! 1&}6FHYZZZ/000z??QJ//I)++w	 oU_E]oodmooppp!##,4<BB%,B\E','?5'H'HV^^fkFFt;X t t!&!9t t=Lt t!&!9t t
 O,t t t    KKqTYTcTlqqqrrr|q  NNn5?#; n n1n nO[n n n   
 !!Q&&KKeo&>  1 8=8P     !##!% />  " " NN<5?#; < <1< <.< < <   lO_mU___s   *\\\c                    d |D             }|                     d |D                       }g }|                                 D ]\  }}|r9| j         d}|                    |          r|t	          |          d          n|}n8|r6t	          |          dk    rd                    | j        |g          n| j        }||v r|                    |           |S )Nc                 n    h | ]2}d                      |                    d           dd                   3S )r   Nr   )r5  r  r   r   s     r|   rp  z>PreTrainedModel.retrieve_modules_from_names.<locals>.<setcomp>  s7    FFFsxx		#ss 344FFFr{   c                     h | ]_}t          |          d k    |d                                         /d                    |                    d          dd                   `S )r   r   r   N)r   r	  r5  r  r  s     r|   rp  z>PreTrainedModel.retrieve_modules_from_names.<locals>.<setcomp>  sY    bbbss3xx!||PSTVPWP_P_PaPa|SXXciinnSbS)**|||r{   r   r   )unionrq  r  r   r   r5  r  )	r#  r  
add_prefixremove_prefixmodule_keysretrieved_modulesr   r   r  s	            r|   retrieve_modules_from_namesz+PreTrainedModel.retrieve_modules_from_names  s	   FFFFF "''bbebbb
 
  ..00 	1 	1LD& m!3666/3w/G/GQtCLLNN++T mCFt99q==sxx!7 >???VZVl{""!((000  r{   c                 v    t          | ||           t          ||          }|}t          | ||||          }	|	S )a  
        This is an experimental function that loads the model using ~1.x model size CPU memory

        Before you call it do:

        1. save which state_dict keys are available
        2. drop state_dict before model is created, since the latter takes 1x model size memory

        Here then we continue:

        3. switch to the meta device all params/buffers that are going to be replaced from the loaded state_dict
        4. load state_dict 2nd time
        5. replace the params/buffers from the state_dict

        Currently, it doesn't handle missing_keys, unexpected_keys, mismatched_keys. It can't handle deepspeed. To
        handle bitsandbytes, needs non-empty hf_quantizer argument.
        rM  )r  r  )r  r=  r  )
rB  r  r  r   r  r  r0  r   r  r  s
             r|   _load_pretrained_model_low_memz.PreTrainedModel._load_pretrained_model_low_mem  sZ    8 	E#9<HHH$%:VVV
.5'%
 
 

 r{   	AutoModelc                     t          |t                    s|j        }ddlmc m} t          ||          st          | d          || _        dS )a  
        Register this class with a given auto class. This should only be used for custom models as the ones in the
        library are already mapped with an auto class.

        <Tip warning={true}>

        This API is experimental and may have some slight breaking changes in the next releases.

        </Tip>

        Args:
            auto_class (`str` or `type`, *optional*, defaults to `"AutoModel"`):
                The auto class to register this new model with.
        r   Nz is not a valid auto class.)	r  r   r   transformers.models.automodelsr;  r  r   r  )r  
auto_classauto_modules      r|   register_for_auto_classz'PreTrainedModel.register_for_auto_class  sn      *c** 	-#,J666666666{J// 	I
GGGHHH$r{   c                     t                      st          d          ddlm} t	          j        |          t	          j        d          k     rt          d| d          ddlm} |                    |           S )a(  
        Converts the model to use [PyTorch's native attention
        implementation](https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html), integrated to
        Transformers through [Optimum library](https://huggingface.co/docs/optimum/bettertransformer/overview). Only a
        subset of all Transformers models are supported.

        PyTorch's attention fastpath allows to speed up inference through kernel fusions and the use of [nested
        tensors](https://pytorch.org/docs/stable/nested.html). Detailed benchmarks can be found in [this blog
        post](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2).

        Returns:
            [`PreTrainedModel`]: The model converted to BetterTransformer.
        <The package `optimum` is required to use Better Transformer.r   r   1.7.0EPlease install optimum>=1.7.0 to use Better Transformer. The version  was found.BetterTransformer)	rG   r  optimum.versionr   r   rf  optimum.bettertransformerr  	transformr#  optimum_versionr  s      r|   to_bettertransformerz$PreTrainedModel.to_bettertransformer6  s     $%% 	^\]]]BBBBBB=))GM',B,BBBtXgttt   	@????? **4000r{   c                     t                      st          d          ddlm} t	          j        |          t	          j        d          k     rt          d| d          ddlm} |                    |           S )a  
        Reverts the transformation from [`~PreTrainedModel.to_bettertransformer`] so that the original modeling is
        used, for example in order to save the model.

        Returns:
            [`PreTrainedModel`]: The model converted back to the original modeling.
        r  r   r   r   r  r  r  )	rG   r  r  r   r   rf  r  r  reverser  s      r|   reverse_bettertransformerz)PreTrainedModel.reverse_bettertransformerR  s     $%% 	^\]]]BBBBBB=))GM',B,BBBtXgttt   	@????? ((...r{   c           
      h   t          |          s,t          j                                        st	                      rdS || j        j        dS | j        j        |ddddgf         v rd}| j        j        | j        j        | j        j        k    sL| j        j        | j        j        | j        j        k    s&| j        j	        T| j        j	        | j        j        k    r:|d| j        j         d| j        j         d| j        j         d| j        j	         d		z  }t                              |           dS dS )
zv
        Shows a one-time warning if the input_ids appear to contain padding and no attention mask was given.
        Nr   r   zWe strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.z5
You may ignore this warning if your `pad_token_id` (z&) is identical to the `bos_token_id` (z), `eos_token_id` (z), or the `sep_token_id` (z ), and your input is not padded.)rU   rs   jit
is_tracingrV   rK  pad_token_idbos_token_ideos_token_idsep_token_idr   r  )r#  r  rA  warn_strings       r|   %warn_if_padding_and_no_attention_maskz5PreTrainedModel.warn_if_padding_and_no_attention_maskh  sn    Y'' 	59+?+?+A+A 	E]E_E_ 	F&DK,D,LF ;#yRG'<<<F  )5$+:RVZVaVn:n:nK,8T[=UY]YdYq=q=qK,8T[=UY]YdYq=q=qiT[Me i i.2k.Fi i[_[f[si i.2k.Fi i i ,,,,,) =<r{   c                 r    t          j        dt                     t          | d          sdS | j        j        S )Nz`_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` insteadr  F)r8  r9  r:  r  r  is_trainabler'  s    r|   _is_quantized_training_enabledz.PreTrainedModel._is_quantized_training_enabled  sD     V	
 	
 	

 t^,, 	5 --r{   c                    t          | j        dd           | j        j        }np| j        j        }|t
          vr[dd                    t
                     d}t          j        || j        j                  }t          |          dk    r	|d         }nd }||t
          vr6t          | j        dd            t                              d| d           d}t
          |         S )	N	loss_type(|rJ  r   z`loss_type=zY` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.ForCausalLM)r   rK  r  r   r   r"   r5  r   findallr   r   r  )r#  r  loss_groupss      r|   loss_functionzPreTrainedModel.loss_function  s     4;T22>-II/I,,;#((<"8"8;;;J{DN4KLL	y>>A%% )!II $I	 = ='$+WbdhBiBiBu>i > > >   &II&&r{   )FNNT)NNTFru  )NNTr  )NFTr   rw  )FNTFNNNNNNNTrv  )r   NNT)r  )r   r~  )or   rx  ry  rz  r_  r  rq  r  r  r  ru  r  r  r  r  r  is_parallelizabler  _is_statefulr  r  _supports_cache_class_supports_static_cache_supports_quantized_cacher|  r   r   rs   r   r  r[  r   r  r  r  r  r   r   r  classmethodr  r   r   r   r   r  r  r   r   r  r  r  r  r  r  r  r  r  r   r  r  r{  r  r  r  r]  r'  r#  r*  rA  r+  r8  rB  rC  rD  rj  r   rl  r  ro  r  r   r
   r  r  r  r  rw   PathLiker  r  r   r=   r  r  r  r  r  r   r`  rr  r  r  r  r
  r  r  r  r   r!  __classcell__r   s   @r|   r~  r~  4  s        6 L!OJK"&  '+# *.& #&+#L # N "" !&9d3#45 9 9 9 X9 3    XU/ U U U U U U.> > >
- 
- 
-; ; ;,5cC#8 ,T , , , ,@ 5 5 [5n  ',-1;?!%] ]  $] ek*	]
 U3S#X#678] ] ] ] []~ U[ U[    [4 ;BI ; ; ; X; $T $ $ $ [$L  .2;?!% %[ [ ek*[ U3S#X#678	[
 [ [ 
[ [ [ [[z  T N^    [:	p 	p 	p* * *&bi & & & &&") & & & &ry      ) ) )& & &6 WW%'YWCFW[^W W W \WrM M M(' ' ' ' '> )-,0"	9 9 9 %SM9 	9
 
9 9 9 9v%+ %+ %+ %+T )-,0"\ \\ !\ %SM	\
 \ 
\ \ \ \B )-%*"~ ~Y~ !~ TN	~
 ~ 
~ ~ ~ ~@  D @ @ @ @0d d d
d d d
c 
 
 
 

r|U2<=P/P)Q 
 
 
 
  "5$sDI~*> 5 5 5 5"(. (. (. (.T :>gq  $ \d    ,/ / /. n4 n n n Xn !%%)"'*!*/#'!%,0!%C Cc2;./C C TN	C
  C C c3hC !C #C c4i()C C C C CJ U>%&&4 4 4 4 '&4   $ U58?  1 1 1 1 ! 1$ U58?%+ %+ %+ %+ %+N' ' ' ' '( ( ( ( ( 
 GK7;(-$!&,0 $!{ { {'/c2;6F0G'H{ /bkABC	{
 E#r{"234{ "&{ { { c4i(){ { { { 
{ { { [{z#  !&!%r` r` r` [r`h! ! ! !. 
 &*% % % \%N % % % [%21 1 1 18/ / /,!- !- !-F 	. 	. X	. ' ' Y X' ' ' ' 'r{   r~  rB  r  z
model file)objectobject_classobject_filesc                   j     e Zd ZdZdef fdZ	 d	dej        deej                 dej        fdZ	 xZ
S )
PoolerStartLogitsz
    Compute SQuAD start logits from sequence hidden states.

    Args:
        config ([`PretrainedConfig`]):
            The config used by the model, will be used to grab the `hidden_size` of the model.
    rK  c                     t                                                       t          j        |j        d          | _        d S )Nr   )r  r  r   rA  hidden_sizedenser#  rK  r   s     r|   r  zPoolerStartLogits.__init__  s3    Yv1155


r{   Nhidden_statesp_maskr   c                     |                      |                              d          }|:t          |           t          j        k    r|d|z
  z  d|z  z
  }n|d|z
  z  d|z  z
  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
                The final hidden states of the model.
            p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
                Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
                should be masked.

        Returns:
            `torch.FloatTensor`: The start logits for SQuAD.
        r   Nr     ꌠ9Y>)F)r2  squeezer   rs   r  )r#  r4  r5  xs       r|   forwardzPoolerStartLogits.forward  ss     JJ}%%--b11"4((EM99V$uv~5V$tf}4r{   r   )r   rx  ry  rz  r   r  rs   FloatTensorr   r;  r)  r*  s   @r|   r/  r/    s         6/ 6 6 6 6 6 6
 W[ ".8@AR8S		       r{   r/  c                        e Zd ZdZdef fdZ	 	 	 ddej        deej                 deej	                 deej                 d	ej        f
d
Z
 xZS )PoolerEndLogitsz
    Compute SQuAD end logits from sequence hidden states.

    Args:
        config ([`PretrainedConfig`]):
            The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps`
            to use.
    rK  c                 N   t                                                       t          j        |j        dz  |j                  | _        t          j                    | _        t          j        |j        |j	                  | _        t          j        |j        d          | _
        d S )Nr  )epsr   )r  r  r   rA  r1  dense_0Tanh
activation	LayerNormlayer_norm_epsdense_1r3  s     r|   r  zPoolerEndLogits.__init__  sz    y!3a!79KLL'))f&8f>STTTy!3Q77r{   Nr4  start_statesstart_positionsr5  r   c                 Z   ||
J d            |a|j         dd         \  }}|ddddf                             dd|          }|                    d|          }|                    d|d          }|                     t	          j        ||gd                    }|                     |          }|                     |          }|                     |          	                    d          }|:t          |           t          j        k    r|d|z
  z  d|z  z
  }n|d|z
  z  d|z  z
  }|S )	a  
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
                The final hidden states of the model.
            start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
                The hidden states of the first tokens for the labeled span.
            start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                The position of the first token for the labeled span.
            p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
                Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
                should be masked.

        <Tip>

        One of `start_states` or `start_positions` should be not `None`. If both are set, `start_positions` overrides
        `start_states`.

        </Tip>

        Returns:
            `torch.FloatTensor`: The end logits for SQuAD.
        N7One of start_states, start_positions should be not Noner  r   r.  r   r7  r8  )r=  rW  gatherrA  rs   r>  rC  rD  rF  r9  r   r  )r#  r4  rG  rH  r5  slenhszr:  s           r|   r;  zPoolerEndLogits.forward  sB   < $(C(CD )D(C(C&%+BCC0ID#-aaatm<CCBCPPO(//ODDL'..r4<<LLLM<#@bIIIJJOOANN1LLOO##B''"4((EM99V$uv~5V$tf}4r{   NNNr   rx  ry  rz  r   r  rs   r<  r   
LongTensorr;  r)  r*  s   @r|   r>  r>    s         8/ 8 8 8 8 8 8 596:.21 1(1 u011 "%"23	1
 *+1 
	1 1 1 1 1 1 1 1r{   r>  c                        e Zd ZdZ fdZ	 	 	 d
dej        deej                 deej                 deej                 dej        f
d	Z	 xZ
S )PoolerAnswerClassz
    Compute SQuAD 2.0 answer class from classification and start tokens hidden states.

    Args:
        config ([`PretrainedConfig`]):
            The config used by the model, will be used to grab the `hidden_size` of the model.
    c                    t                                                       t          j        |j        dz  |j                  | _        t          j                    | _        t          j        |j        dd          | _        d S )Nr  r   F)r  )	r  r  r   rA  r1  rA  rB  rC  rF  r3  s     r|   r  zPoolerAnswerClass.__init__)  sc    y!3a!79KLL'))y!3QUCCCr{   Nr4  rG  rH  	cls_indexr   c                 `   |j         d         }||
J d            |K|ddddf                             dd|          }|                    d|                              d          }|L|ddddf                             dd|          }|                    d|                              d          }n|dddddf         }|                     t          j        ||gd                    }|                     |          }|                     |                              d          }|S )a  
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
                The final hidden states of the model.
            start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
                The hidden states of the first tokens for the labeled span.
            start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                The position of the first token for the labeled span.
            cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                Position of the CLS token for each sentence in the batch. If `None`, takes the last token.

        <Tip>

        One of `start_states` or `start_positions` should be not `None`. If both are set, `start_positions` overrides
        `start_states`.

        </Tip>

        Returns:
            `torch.FloatTensor`: The SQuAD 2.0 answer class.
        r   NrJ  r  rK  )	r=  rW  rL  r9  rA  rs   r>  rC  rF  )r#  r4  rG  rH  rU  rN  cls_token_stater:  s           r|   r;  zPoolerAnswerClass.forward/  s>   : !"%$(C(CD )D(C(C&-aaatm<CCBCPPO(//ODDLLRPPL !!!!T4-077BDDI+222yAAII"MMOO+AAAr111H5OLLL/#BKKKLLOOALLOO##B''r{   rO  )r   rx  ry  rz  r  rs   r<  r   rQ  r;  r)  r*  s   @r|   rS  rS     s         D D D D D 596:04/ /(/ u01/ "%"23	/
 E,-/ 
	/ / / / / / / /r{   rS  c                       e Zd ZU dZdZeej                 ed<   dZ	eej                 ed<   dZ
eej                 ed<   dZeej                 ed<   dZeej                 ed<   dZeej                 ed<   dS )	SquadHeadOutputa  
    Base class for outputs of question answering models using a [`~modeling_utils.SQuADHead`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
            Classification loss as the sum of start token, end token (and is_impossible if provided) classification
            losses.
        start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Log probabilities for the top config.start_n_top start token possibilities (beam-search).
        start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Indices for the top config.start_n_top start token possibilities (beam-search).
        end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Log probabilities for the top `config.start_n_top * config.end_n_top` end token possibilities
            (beam-search).
        end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search).
        cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
            Log probabilities for the `is_impossible` label of the answers.

    Nlossstart_top_log_probsstart_top_indexend_top_log_probsend_top_index
cls_logits)r   rx  ry  rz  rZ  r   rs   r<  __annotations__r[  r\  rQ  r]  r^  r_  rz   r{   r|   rY  rY  a  s          * )-D(5$
%,,,7;%"34;;;26OXe./66659x 1299904M8E,-444.2J*+22222r{   rY  c                       e Zd ZdZ fdZ eee          	 	 	 	 	 	 ddej	        de
ej                 de
ej                 d	e
ej                 d
e
ej                 de
ej	                 dedeeeej	                 f         fd            Z xZS )	SQuADHeadz
    A SQuAD head inspired by XLNet.

    Args:
        config ([`PretrainedConfig`]):
            The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps`
            to use.
    c                     t                                                       |j        | _        |j        | _        t	          |          | _        t          |          | _        t          |          | _	        d S r   )
r  r  start_n_top	end_n_topr/  start_logitsr>  
end_logitsrS  answer_classr3  s     r|   r  zSQuADHead.__init__  sc    !-)-f55)&11-f55r{   )output_typer_  NFr4  rH  end_positionsrU  is_impossibler5  return_dictr   c                    |                      ||          }||||||fD ]1}	|	-|	                                dk    r|	                    d           2|                     |||          }
t	                      } |||          } ||
|          }||z   dz  }|A|?|                     |||          }t          j                    } |||          }||dz  z  }|rt          |	          n|fS |	                                \  }}}t          j
                            |d
          }t          j        || j        d
          \  }}|                    d                              dd|          }t          j        |d|          }|                    d                              d|dd          }|                    d                              |          }||                    d          nd}|                     |||          }
t          j
                            |
d
          }t          j        || j        d
          \  }}|                    d| j        | j        z            }|                    d| j        | j        z            }t          j        d||          }|                     |||          }|s|||||fS t          |||||          S )a  
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
                Final hidden states of the model on the sequence tokens.
            start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                Positions of the first token for the labeled span.
            end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                Positions of the last token for the labeled span.
            cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                Position of the CLS token for each sentence in the batch. If `None`, takes the last token.
            is_impossible (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                Whether the question has a possible answer in the paragraph or not.
            p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
                Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
                should be masked.
            return_dict (`bool`, *optional*, defaults to `False`):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

        Returns:
        )r5  Nr   r   )rH  r5  r  )rH  rU  g      ?)rZ  rK  r  )rG  r5  z
blh,bl->bh)rG  rU  )r[  r\  r]  r^  r_  )rf  r.  squeeze_rg  r   rh  r   BCEWithLogitsLossrY  r6  r  softmaxrs   topkrd  rS  rW  rL  	expand_asre  rz  einsum)r#  r4  rH  rj  rU  rk  r5  rl  rf  r:  rg  loss_fct
start_lossend_loss
total_lossr_  loss_fct_clscls_lossbszrM  rN  start_log_probsr[  r\  start_top_index_exprG  hidden_states_expandedend_log_probsr]  r^  s                                 r|   r;  zSQuADHead.forward  s7   > ((v(FF&=+D%}iO # #=QUUWWq[[JJrNNN `fggJ'))H!,@@Jx
M::H$x/14J$)B!..}oir.ss
!355'<
MBB hn,
7BU?
3333U +//11NCs m33Lb3IIO38:!1r4 4 40 #2";";B"?"?"F"Fr2s"S"S <r;NOOL'11!44;;Bb"MML%2%<%<Q%?%?%I%I& &" .4-?V%%b)))TF)?lcijjJM11*!1DDM/4zt~10 0 0,} !2 6 6r4;Kdn;\ ] ])..r43Cdn3TUUM <m_UUL**=|_h*iiJ 	+_>OQ^`jkk&(;$3&7"/)   r{   )NNNNNF)r   rx  ry  rz  r  rN   rY  r   rs   r<  r   rQ  r   r   r   r;  r)  r*  s   @r|   rb  rb    s*        6 6 6 6 6 ?IYZZZ 7;480448.2!^ ^(^ "%"23^   01	^
 E,-^   01^ *+^ ^ 
e&7 88	9^ ^ ^ [Z^ ^ ^ ^ ^r{   rb  c                   j     e Zd ZdZdef fdZ	 d	dej        deej	                 dej        fdZ
 xZS )
SequenceSummarya  
    Compute a single vector summary of a sequence hidden states.

    Args:
        config ([`PretrainedConfig`]):
            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
            config class of your model for the default values it uses):

            - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:

                - `"last"` -- Take the last token hidden state (like XLNet)
                - `"first"` -- Take the first token hidden state (like Bert)
                - `"mean"` -- Take the mean of all tokens hidden states
                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
                - `"attn"` -- Not implemented now, use multi-head attention

            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
              (otherwise to `config.hidden_size`).
            - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
              another string or `None` will add no activation.
            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
            - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
    rK  c                 .   t                                                       t          |dd          | _        | j        dk    rt          t                      | _        t          |d          rW|j        rPt          |d          r|j	        r|j
        dk    r|j
        }n|j        }t          j        |j        |          | _        t          |dd           }|rt          |          nt                      | _        t                      | _        t          |d          r)|j        dk    rt          j        |j                  | _        t                      | _        t          |d	          r+|j        dk    r"t          j        |j                  | _        d S d S d S )
Nsummary_typelastattnsummary_use_projsummary_proj_to_labelsr   summary_activationsummary_first_dropoutsummary_last_dropout)r  r  r   r  r  r   summaryr  r  r  
num_labelsr1  r   rA  r   rC  first_dropoutr  Dropoutlast_dropoutr  )r#  rK  num_classesactivation_stringr   s       r|   r  zSequenceSummary.__init__  s   #FNFCC&& &%zz6-.. 	F63J 	Fv788 1V=Z 1_e_pst_t_t$/$09V%7EEDL#F,@$GGIZ$jN3D$E$E$E`h`j`j%ZZ6233 	J8TWX8X8X!#F,H!I!ID$JJ6122 	Hv7RUV7V7V "
6+F G GD	H 	H7V7Vr{   Nr4  rU  r   c                 :   | j         dk    r|dddf         }n-| j         dk    r|dddf         }n| j         dk    r|                    d          }n| j         d	k    r|=t          j        |d
ddddf         |j        d         dz
  t          j                  }nl|                    d                              d          }|                    d|                                dz
  z  |	                    d          fz             }|
                    d|                              d          }n| j         dk    rt          |                     |          }|                     |          }|                     |          }|                     |          }|S )ak  
        Compute a single vector summary of a sequence hidden states.

        Args:
            hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
                The hidden states of the last layer.
            cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
                Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.

        Returns:
            `torch.FloatTensor`: The summary of the sequence hidden states.
        r  Nr   firstr   rM  r   rK  rU  .r  r,  )r   r  )r  rM  rs   	full_liker=  longrS  rW  r.  r6  rL  r9  r  r  r  rC  r  )r#  r4  rU  r  s       r|   r;  zSequenceSummary.forward,  s    &&"111b5)FF'))"111a4(FF&(("''A'..FF+-- !O!#rr111*-!'+a/*  		 &//33==bAA	%,,Uimmoo6I-JmN`N`acNdNdMf-fgg	"))"i88@@DDFF&((%%##F++f%%((""6**r{   r   rP  r*  s   @r|   r  r    s         2H/ H H H H H H< Y]) )".);CEDT;U)		) ) ) ) ) ) ) )r{   r  	recursivec                     t                      r4i }|r#t          d          st          d          ||d<   t          | fi |S t          | d          rt	          | j                  S | S )a  
    Recursively unwraps a model from potential containers (as used in distributed training).

    Args:
        model (`torch.nn.Module`): The model to unwrap.
        recursive (`bool`, *optional*, defaults to `False`):
            Whether to recursively extract all cases of `module.module` from `model` as well as unwrap child sublayers
            recursively, not just the top-level distributed containers.
    z0.29.0zsSetting `recursive=True` to `unwrap_model` requires `accelerate` v0.29.0. Please upgrade your version of accelerater  r   )rC   r;  ra   r  r  r   )rB  r  r   s      r|   r  r  X  s         	0*844 0" J   '0{#*5;;F;;; 5(## 	---Lr{   c                     i }fd|D             }|                                  D ]'\  |                    fd|D                        (|S )zT
    Expand a device map to return the correspondance parameter name to device.
    c                 h    g | ].}|                               |t                    d          /S r   r  )r   r  r   s     r|   r   z%expand_device_map.<locals>.<listcomp>{  s=    ]]]a!,,|B\B\]1S&&(()]]]r{   c                 ^    i | ])}|k    s|                      d           sdk    &|*S rn  r   )r   r  r   r   s     r|   r  z%expand_device_map.<locals>.<dictcomp>~  sE    iii1qF{{allf<<<>X>X{\bfh\h\hQ\h\h\hr{   )r   rk  )r  param_namesr   new_device_mapr   r   s     ` @@r|   r  r  v  s     N]]]];]]]K$**,, 
 
iiiiiiii	
 	
 	
 	
 r{   c                    fd|d                                          D             }t          j        t                    }|                                 D ]\  }}t	          |          dk    rK|| vrGd                    |                    d          dd                   }t	          |          dk    r|| vG||                             | |                    d |                                 D             S )zT
    Returns the list of shard files containing only weights offloaded to disk.
    c                 p    i | ]2\  }}|                               |t                    d          |3S r   r  )r   r  r   r   s      r|   r  z-get_disk_only_shard_files.<locals>.<dictcomp>  sW       &*aXYXdXdeqXrXr	#l



  r{   r  r   r   Nr   c                 @    g | ]\  }}t          |          d hk    |S )r  )r:  )r   fnamedevicess      r|   r   z-get_disk_only_shard_files.<locals>.<listcomp>  s0    ZZZneWWRXQYAYAYEAYAYAYr{   )r   r  r  r   r   r5  r  r  )r  rX  r   r  files_contentr  r  s     `    r|   r  r    s   
   .>|.L.R.R.T.T  J  +D11M!+!1!1!3!3 @ @X+""{*'D'D((;#4#4S#9#9#2##>??K +""{*'D'Dh&&z+'>????ZZ(;(;(=(=ZZZZr{   rw  r  )TT)FNTru  )NNNNNNNFNNNr   )r  r  r~  r>  importlib.metadatar  r  r  r8  rw   r   r  r  r8  
contextlibr   dataclassesr   r   r   r   	threadingr   typingr	   r
   r   r   r   r   r   r   zipfiler   rs   huggingface_hubr   	packagingr   r   r   torch.nnr   r   torch.utils.checkpointr   activationsr   configuration_utilsr   dynamic_module_utilsr   
generationr   r   integrationsr   r    r!   loss.loss_utilsr"   pytorch_utilsr#   r$   r%   r&   r'   r(   r)   r*   
quantizersr+   r,   quantizers.quantizers_utilsr-   safetensors_conversionr.   r  r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   	utils.hubrP   rQ   rR   utils.import_utilsrS   rT   rU   rV   utils.quantization_configrW   rX   rx   ry   upperrY   r[   ri   r\   r]   r^   accelerate.hooksr_   accelerate.utilsr`   ra   rb   rc   rd   re   rf   rg   rh   rf  r
  r  accelerate.utils.modelingrk   safetensorsrl   safetensors.torchrm   r<  rn   r  
get_loggerr   r   r   r}   r   !smdistributed.modelparallel.torchmodelparallelr  smdistributed.modelparallelr   SMP_VERSIONr  r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  rU  r(  r   r   r=  rv  r~  r  r  r  r  r  r  r  r  r   r~  r  rz  r\  r/  r>  rS  rY  rb  r  r  r  r  rz   r{   r|   <module>r     s+             				           				 				    % % % % % % ! ! ! ! ! ! / / / / / / / / / /       I I I I I I I I I I I I I I I I I I I I        > > > > > >               / / / / / / / / - - - - - - ' ' ' ' ' ' 1 1 1 1 1 1 4 4 4 4 4 4 9 9 9 9 9 9 9 9 X X X X X X X X X X ) ) ) ) ) )	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 5 4 4 4 4 4 4 4 = = = = = = 3 3 3 3 3 3" " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " "F g f f f f f f f f f            N M M M M M M M z~~nc2288::JNN#6<<BBDD   JTTTTTTTTTT333333
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 'y'9'A'A,'O'OPP]W]62222IIIIII >%%%%%%============		H	%	%       &333333333FFFFFF -k : :mgmF>S>S S % 0//////  wW*"w.g,0w.wgng,W*w.g,  $ 8 8 8 88%E")5G*G$H % % % %$ry:L/L)M $ $ $ $$.54F)F#G . . . .bN N N
/ 
/ 
/  (   @ LRgsS SS%,&'S9>sCxSadS S S SlRT RT RT RTn 7;	C C3+,CC 5el!234C 	C C C CL& & &$U\ c     ")    ,DSN ,S%,=N8O ,TYZ^_bcf_gZhjnorjsZsTt , , , ,>%T#c(^ %c5<>O9P %UZ[_`cdg`h[iknorks[sUt % % % %&K K K K\# # #.4 4 4> 
"&p7 p7 p7 p7f s Xc] c    tq tq tq tq tq tq tq tqp	w9' w9' w9' w9' w9'bi!1?NTd w9' w9' w9'ts (i(CDD &2*9*E*M*T*T[| +U + +O'
# # # # #	 # # #LB B B B Bbi B B BJ> > > > >	 > > >B 3 3 3 3 3k 3 3 3<r r r r r	 r r rj` ` ` ` `bi ` ` `F 	 d ry    <
 
 
[ [ [ [ [r{   