import logging
import os
from copy import deepcopy
from typing import Dict, List, Optional, Union

import torch
import torch.nn as nn

from accelerate.utils.imports import is_4bit_bnb_available, is_8bit_bnb_available

from ..big_modeling import dispatch_model, init_empty_weights
from .dataclasses import BnbQuantizationConfig
from .modeling import (
    find_tied_parameters,
    get_balanced_memory,
    infer_auto_device_map,
    load_checkpoint_in_model,
    offload_weight,
    set_module_tensor_to_device,
)


logger = logging.getLogger(__name__)


def load_and_quantize_model(
    model: torch.nn.Module,
    bnb_quantization_config: BnbQuantizationConfig,
    weights_location: Union[str, os.PathLike] = None,
    device_map: Optional[Dict[str, Union[int, str, torch.device]]] = None,
    no_split_module_classes: Optional[List[str]] = None,
    max_memory: Optional[Dict[Union[int, str], Union[int, str]]] = None,
    offload_folder: Optional[Union[str, os.PathLike]] = None,
    offload_state_dict: bool = False,
):
    """
    This function will quantize the input model with the associated config passed in `bnb_quantization_config`. If the
    model is in the meta device, we will load and dispatch the weights according to the `device_map` passed. If the
    model is already loaded, we will quantize the model and put the model on the GPU.

    Args:
        model (`torch.nn.Module`):
            Input model. The model can be already loaded or on the meta device
        bnb_quantization_config (`BnbQuantizationConfig`):
            The bitsandbytes quantization parameters
        weights_location (`str` or `os.PathLike`):
            The path to the weights to load. It can be:
            - a path to a file containing a whole model state dict
            - a path to a `.json` file containing the index to a sharded checkpoint
            - a path to a folder containing a unique `.index.json` file and the shards of a checkpoint.
            - a path to a folder containing a unique pytorch_model.bin file.
        device_map (`Dict[str, Union[int, str, torch.device]]`, *optional*):
            A map that specifies where each submodule should go. It doesn't need to be refined to each parameter/buffer
            name, once a given module name is inside, every submodule of it will be sent to the same device.
        no_split_module_classes (`List[str]`, *optional*):
            A list of layer class names that should never be split across devices (for instance any layer that has a
            residual connection).
        max_memory (`Dict`, *optional*):
            A dictionary mapping device identifiers to their maximum memory. Will default to the maximum memory available if unset.
        offload_folder (`str` or `os.PathLike`, *optional*):
            If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
        offload_state_dict (`bool`, *optional*, defaults to `False`):
            If `True`, will temporarily offload the CPU state dict on the hard drive to avoid getting out of CPU RAM if
            the weight of the CPU state dict + the biggest shard does not fit.

    Returns:
        `torch.nn.Module`: The quantized model
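
    Example (an illustrative sketch, not part of the upstream docs; assumes a user-provided
    `MyModel` class and an available CUDA GPU):

    ```python
    from accelerate import init_empty_weights
    from accelerate.utils import BnbQuantizationConfig, load_and_quantize_model

    # Instantiate the model on the meta device so no real memory is allocated.
    with init_empty_weights():
        empty_model = MyModel()  # hypothetical model class

    bnb_config = BnbQuantizationConfig(load_in_8bit=True, llm_int8_threshold=6.0)
    quantized_model = load_and_quantize_model(
        empty_model,
        bnb_quantization_config=bnb_config,
        weights_location="path/to/checkpoint",
        device_map="auto",
    )
    ```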
    """
    load_in_4bit = bnb_quantization_config.load_in_4bit
    load_in_8bit = bnb_quantization_config.load_in_8bit

    if load_in_8bit and not is_8bit_bnb_available():
        raise ImportError(
            "You have a version of `bitsandbytes` that is not compatible with 8bit quantization,"
            " make sure you have the latest version of `bitsandbytes` installed."
        )
    if load_in_4bit and not is_4bit_bnb_available():
        raise ValueError(
            "You have a version of `bitsandbytes` that is not compatible with 4bit quantization,"
            " make sure you have the latest version of `bitsandbytes` installed."
        )

    modules_on_cpu = []
    # Custom device map
    if isinstance(device_map, dict) and len(device_map.keys()) > 1:
        modules_on_cpu = [key for key, value in device_map.items() if value in ["disk", "cpu"]]

    # We keep some modules such as the lm_head in their original dtype for numerical stability reasons
    if bnb_quantization_config.skip_modules is None:
        bnb_quantization_config.skip_modules = get_keys_to_not_convert(model)

    # Add cpu modules to skip modules only for 4-bit modules
    if load_in_4bit:
        bnb_quantization_config.skip_modules.extend(modules_on_cpu)
    modules_to_not_convert = bnb_quantization_config.skip_modules

    # We add the modules we want to keep in full precision
    if bnb_quantization_config.keep_in_fp32_modules is None:
        bnb_quantization_config.keep_in_fp32_modules = []
    keep_in_fp32_modules = bnb_quantization_config.keep_in_fp32_modules
    modules_to_not_convert.extend(keep_in_fp32_modules)

    # Compatibility with peft
    model.is_loaded_in_4bit = load_in_4bit
    model.is_loaded_in_8bit = load_in_8bit

    model_device = get_parameter_device(model)
    if model_device.type != "meta":
        # Quantization of an already loaded model
        logger.warning(
            "It is not recommended to quantize a loaded model. "
            "The model should be instantiated under the `init_empty_weights` context manager."
        )
        model = replace_with_bnb_layers(model, bnb_quantization_config, modules_to_not_convert=modules_to_not_convert)
        # Convert the parameters to the right dtype
        dtype = bnb_quantization_config.torch_dtype
        for name, param in model.state_dict().items():
            if any(module_to_keep_in_fp32 in name for module_to_keep_in_fp32 in keep_in_fp32_modules):
                param.to(torch.float32)
                if param.dtype != torch.float32:
                    name = name.replace(".weight", "").replace(".bias", "")
                    param = getattr(model, name, None)
                    if param is not None:
                        param.to(torch.float32)
            elif torch.is_floating_point(param):
                param.to(dtype)
        if model_device.type == "cuda":
            model.cuda(torch.cuda.current_device())
            torch.cuda.empty_cache()
        elif torch.cuda.is_available():
            model.to(torch.cuda.current_device())
        else:
            raise RuntimeError("No GPU found. A GPU is needed for quantization.")
        logger.info(
            f"The model device type is {model_device.type}. However, cuda is needed for quantization. "
            "We move the model to cuda."
        )
        return model
    elif weights_location is None:
        raise RuntimeError(
            f"`weights_location` needs to be the folder path containing the weights of the model, but we found {weights_location}"
        )
    else:
        with init_empty_weights():
            model = replace_with_bnb_layers(
                model, bnb_quantization_config, modules_to_not_convert=modules_to_not_convert
            )

        device_map = get_quantized_model_device_map(
            model,
            bnb_quantization_config,
            device_map,
            max_memory=max_memory,
            no_split_module_classes=no_split_module_classes,
        )
        if offload_state_dict is None and device_map is not None and "disk" in device_map.values():
            offload_state_dict = True

        offload = any(x in list(device_map.values()) for x in ["cpu", "disk"])

        load_checkpoint_in_model(
            model,
            weights_location,
            device_map,
            dtype=bnb_quantization_config.torch_dtype,
            offload_folder=offload_folder,
            offload_state_dict=offload_state_dict,
            keep_in_fp32_modules=bnb_quantization_config.keep_in_fp32_modules,
            offload_8bit_bnb=load_in_8bit and offload,
        )
        return dispatch_model(model, device_map=device_map, offload_dir=offload_folder)


def get_quantized_model_device_map(
    model, bnb_quantization_config, device_map=None, max_memory=None, no_split_module_classes=None
):
    if device_map is None:
        if torch.cuda.is_available():
            device_map = {"": torch.cuda.current_device()}
        else:
            raise RuntimeError("No GPU found. A GPU is needed for quantization.")
        logger.info("The device_map was not initialized. Setting device_map to `{'':torch.cuda.current_device()}`.")

    if isinstance(device_map, str):
        if device_map not in ["auto", "balanced", "balanced_low_0", "sequential"]:
            raise ValueError(
                "If passing a string for `device_map`, please choose 'auto', 'balanced', 'balanced_low_0' or "
                "'sequential'."
            )

        special_dtypes = {}
        special_dtypes.update(
            {
                name: bnb_quantization_config.torch_dtype
                for name, _ in model.named_parameters()
                if any(m in name for m in bnb_quantization_config.skip_modules)
            }
        )
        special_dtypes.update(
            {
                name: torch.float32
                for name, _ in model.named_parameters()
                if any(m in name for m in bnb_quantization_config.keep_in_fp32_modules)
            }
        )

        kwargs = {}
        kwargs["special_dtypes"] = special_dtypes
        kwargs["no_split_module_classes"] = no_split_module_classes
        kwargs["dtype"] = bnb_quantization_config.target_dtype

        # Get max_memory for each device
        if device_map != "sequential":
            max_memory = get_balanced_memory(
                model,
                low_zero=(device_map == "balanced_low_0"),
                max_memory=max_memory,
                **kwargs,
            )

        kwargs["max_memory"] = max_memory
        device_map = infer_auto_device_map(model, **kwargs)

    if isinstance(device_map, dict):
        # Check that no quantized module ends up on the cpu or the disk
        modules_not_to_convert = bnb_quantization_config.skip_modules + bnb_quantization_config.keep_in_fp32_modules

        device_map_without_some_modules = {
            key: device_map[key] for key in device_map.keys() if key not in modules_not_to_convert
        }
        for device in ["cpu", "disk"]:
            if device in device_map_without_some_modules.values():
                if bnb_quantization_config.load_in_4bit:
                    raise ValueError(
                        """
                        Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit
                        the quantized model. If you want to dispatch the model on the CPU or the disk while keeping
                        these modules in `torch_dtype`, you need to pass a custom `device_map` to
                        `load_and_quantize_model`. Check
                        https://huggingface.co/docs/accelerate/main/en/usage_guides/quantization#offload-modules-to-cpu-and-disk
                        for more details.
                        """
                    )
                else:
                    logger.info(
                        "Some modules are offloaded to the CPU or the disk. Note that these modules will be converted to 8-bit"
                    )
        del device_map_without_some_modules
    return device_map


def replace_with_bnb_layers(model, bnb_quantization_config, modules_to_not_convert=None, current_key_name=None):
    """
    A helper function to replace all `torch.nn.Linear` modules by `bnb.nn.Linear8bitLt` modules or by `bnb.nn.Linear4bit`
    modules from the `bitsandbytes` library. The function is run recursively and replaces every `torch.nn.Linear` module.

    Parameters:
        model (`torch.nn.Module`):
            Input model or `torch.nn.Module` as the function is run recursively.
        modules_to_not_convert (`List[str]`):
            Names of the modules to not convert. In practice we keep the `lm_head` in full precision for
            numerical stability reasons.
        current_key_name (`List[str]`, *optional*):
            An array to track the current key of the recursion. This is used to check whether the current key (part of
            it) is not in the list of modules to not convert.
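
    Example (a minimal sketch; `model` is any instantiated `torch.nn.Module` and the config
    values are illustrative):

    ```python
    config = BnbQuantizationConfig(load_in_8bit=True)
    model = replace_with_bnb_layers(model, config, modules_to_not_convert=["lm_head"])
    ```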
    """
    if modules_to_not_convert is None:
        modules_to_not_convert = []

    model, has_been_replaced = _replace_with_bnb_layers(
        model, bnb_quantization_config, modules_to_not_convert, current_key_name
    )
    if not has_been_replaced:
        logger.warning(
            "You are loading your model in 8bit or 4bit but no linear modules were found in your model."
            " This can happen for some architectures such as gpt2 that use Conv1D instead of Linear layers."
            " Please double check your model architecture, or submit an issue on GitHub if you think this is a bug."
        )
    return model


def _replace_with_bnb_layers(
    model,
    bnb_quantization_config,
    modules_to_not_convert=None,
    current_key_name=None,
):
    """
    Private method that wraps the recursion for module replacement.

    Returns the converted model and a boolean that indicates if the conversion has been successful or not.
    """
    # bitsandbytes initializes CUDA on import, so it is imported lazily here
    import bitsandbytes as bnb

    has_been_replaced = False
    for name, module in model.named_children():
        if current_key_name is None:
            current_key_name = []
        current_key_name.append(name)
        if isinstance(module, nn.Linear) and name not in modules_to_not_convert:
            # Check if the current key is not in the `modules_to_not_convert`
            current_key_name_str = ".".join(current_key_name)
            proceed = True
            for key in modules_to_not_convert:
                if (
                    (key in current_key_name_str) and (key + "." in current_key_name_str)
                ) or key == current_key_name_str:
                    proceed = False
                    break
            if proceed:
                # Load bnb module with empty weight and replace the `nn.Linear` module
                if bnb_quantization_config.load_in_8bit:
                    bnb_module = bnb.nn.Linear8bitLt(
                        module.in_features,
                        module.out_features,
                        module.bias is not None,
                        has_fp16_weights=False,
                        threshold=bnb_quantization_config.llm_int8_threshold,
                    )
                elif bnb_quantization_config.load_in_4bit:
                    bnb_module = bnb.nn.Linear4bit(
                        module.in_features,
                        module.out_features,
                        module.bias is not None,
                        bnb_quantization_config.bnb_4bit_compute_dtype,
                        compress_statistics=bnb_quantization_config.bnb_4bit_use_double_quant,
                        quant_type=bnb_quantization_config.bnb_4bit_quant_type,
                    )
                else:
                    raise ValueError("load_in_8bit and load_in_4bit can't be both False")
                bnb_module.weight.data = module.weight.data
                if module.bias is not None:
                    bnb_module.bias.data = module.bias.data
                bnb_module.requires_grad_(False)
                setattr(model, name, bnb_module)
                has_been_replaced = True
        if len(list(module.children())) > 0:
            _, _has_been_replaced = _replace_with_bnb_layers(
                module, bnb_quantization_config, modules_to_not_convert, current_key_name
            )
            has_been_replaced = has_been_replaced | _has_been_replaced
        # Remove the last key for recursion
        current_key_name.pop(-1)
    return model, has_been_replaced


def get_keys_to_not_convert(model):
    r"""
    A utility function to get the keys of the modules to keep in full precision, if any. For example, for CausalLM modules
    we may want to keep the lm_head in full precision for numerical stability reasons. For other architectures, we want
    to keep the tied weights of the model. The function will return a list of the keys of the modules to not convert in
    int8.
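
    Example (illustrative; the exact names depend on the architecture):

    ```python
    # For a causal LM with a tied output head, this typically returns something
    # like ["lm_head"], which `replace_with_bnb_layers` then leaves untouched.
    skip_modules = get_keys_to_not_convert(model)
    ```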

    Parameters:
    model (`torch.nn.Module`):
        Input model
    """
    # Create a copy of the model
    with init_empty_weights():
        tied_model = deepcopy(model)  # this has 0 cost since it is done inside `init_empty_weights` context manager

    tied_params = find_tied_parameters(tied_model)
    # For compatibility with Accelerate < 0.18
    if isinstance(tied_params, dict):
        tied_keys = sum(list(tied_params.values()), []) + list(tied_params.keys())
    else:
        tied_keys = sum(tied_params, [])
    has_tied_params = len(tied_keys) > 0

    # Check if it is a base model
    is_base_model = False
    if hasattr(model, "base_model_prefix"):
        is_base_model = not hasattr(model, model.base_model_prefix)

    # Ignore this for base models (BertModel, GPT2Model, etc.)
    if (not has_tied_params) and is_base_model:
        return []

    # Otherwise the model has an attached head
    list_modules = list(model.named_children())
    list_last_module = [list_modules[-1][0]]

    # Add the last module together with the tied weights
    intersection = set(list_last_module) - set(tied_keys)
    list_untouched = list(set(tied_keys)) + list(intersection)

    # Remove ".weight" and ".bias" from the keys
    names_to_remove = [".weight", ".bias"]
    filtered_module_names = []
    for name in list_untouched:
        for name_to_remove in names_to_remove:
            if name_to_remove in name:
                name = name.replace(name_to_remove, "")
        filtered_module_names.append(name)

    return filtered_module_names


def has_4bit_bnb_layers(model):
    """Check if we have `bnb.nn.Linear4bit` or `bnb.nn.Linear8bitLt` layers inside our model"""
    # bitsandbytes initializes CUDA on import, so it is imported lazily here
    import bitsandbytes as bnb

    for m in model.modules():
        if isinstance(m, bnb.nn.Linear4bit):
            return True
    return False


def get_parameter_device(parameter: nn.Module):
    return next(parameter.parameters()).device


def quantize_and_offload_8bit(model, param, param_name, new_dtype, offload_folder, offload_index, fp16_statistics):
    # If it is not quantized yet, we quantize and offload the quantized weights and the SCB stats
    if fp16_statistics is None:
        set_module_tensor_to_device(model, param_name, 0, dtype=new_dtype, value=param)
        tensor_name = param_name
        module = model
        # Split the param_name and retrieve the submodule that holds the tensor
        if "." in tensor_name:
            splits = tensor_name.split(".")
            for split in splits[:-1]:
                new_module = getattr(module, split)
                if new_module is None:
                    raise ValueError(f"{module} has no attribute {split}.")
                module = new_module
            tensor_name = splits[-1]
        # Offload weights
        module._parameters[tensor_name].requires_grad = False
        offload_weight(module._parameters[tensor_name], param_name, offload_folder, index=offload_index)
        if hasattr(module._parameters[tensor_name], "SCB"):
            offload_weight(
                module._parameters[tensor_name].SCB,
                param_name.replace("weight", "SCB"),
                offload_folder,
                index=offload_index,
            )
    else:
        offload_weight(param, param_name, offload_folder, index=offload_index)
        offload_weight(fp16_statistics, param_name.replace("weight", "SCB"), offload_folder, index=offload_index)

    set_module_tensor_to_device(model, param_name, "meta", dtype=new_dtype, value=torch.empty(*param.size()))