import importlib
from typing import TYPE_CHECKING, Any, Dict, List, Optional

from packaging import version

from .base import HfQuantizer


if TYPE_CHECKING:
    from ..modeling_utils import PreTrainedModel

from ..utils import is_accelerate_available, is_fbgemm_gpu_available, is_torch_available, logging
from .quantizers_utils import get_module_from_name


if is_torch_available():
    import torch

logger = logging.get_logger(__name__)


class FbgemmFp8HfQuantizer(HfQuantizer):
    """
    FP8 quantization using fbgemm kernels
    """

    requires_parameters_quantization = True
    requires_calibration = False

    required_packages = ["fbgemm-gpu", "accelerate"]
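
    # Editorial note (a gloss on the HfQuantizer API, not original to this file):
    # `requires_parameters_quantization = True` means the loading machinery routes
    # each tensor through `check_quantized_param`/`create_quantized_param` below, so
    # weights are quantized one by one as they are materialized rather than by
    # swapping whole modules after loading.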

    def __init__(self, quantization_config, **kwargs):
        super().__init__(quantization_config, **kwargs)
        self.quantization_config = quantization_config

    def validate_environment(self, *args, **kwargs):
        if not is_torch_available() or version.parse(importlib.metadata.version("torch")) < version.parse("2.1.0"):
            raise ImportError(
                "Using fbgemm fp8 quantization requires torch >= 2.1.0. "
                "Please install the latest version of torch (`pip install --upgrade torch`)."
            )

        if not is_fbgemm_gpu_available():
            raise ImportError(
                "Using fbgemm fp8 quantization requires the fbgemm-gpu library. Please install the latest version "
                "by following the instructions at "
                "https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-libraries"
            )

        if not is_accelerate_available("0.32.2"):
            raise ImportError(
                "Loading an FP8 quantized model requires accelerate > 0.32.1 (`pip install --upgrade accelerate`)."
            )

        if not torch.cuda.is_available():
            raise RuntimeError("Using FP8 quantized models with fbgemm kernels requires a GPU.")

        compute_capability = torch.cuda.get_device_capability()
        major, minor = compute_capability
        if major < 9:
            raise ValueError(
                "FP8 quantized models are only supported on GPUs with compute capability >= 9.0 (e.g. H100)."
            )

        device_map = kwargs.get("device_map", None)
        if device_map is None:
            logger.warning_once(
                "You have loaded an FP8 model on CPU and have a CUDA device available; make sure to move your "
                "model to a GPU device in order to run it. To remove this warning, pass device_map='cuda'."
            )
        elif (
            not self.pre_quantized
            and isinstance(device_map, dict)
            and ("cpu" in device_map.values() or "disk" in device_map.values())
        ):
            raise ValueError(
                "You are attempting to load an FP8 model with a device_map that contains a CPU or disk device. "
                "This is not supported when the model is quantized on the fly. Please use a quantized checkpoint "
                "or remove the CPU or disk device from the device_map."
            )

    def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
        if torch_dtype is None:
            torch_dtype = torch.bfloat16
            logger.info(
                "Overriding torch_dtype=%s with `torch_dtype=torch.bfloat16` due to requirements of `fbgemm-gpu` "
                "to enable model loading in fp8. Pass your own torch_dtype to specify the dtype of the remaining "
                "non-linear layers, or pass torch_dtype=torch.bfloat16 to remove this warning.",
                torch_dtype,
            )
        elif torch_dtype == torch.float16:
            raise ValueError(
                "You cannot use FP8 with torch_dtype=torch.float16. We recommend passing torch_dtype=torch.bfloat16."
            )
        return torch_dtype

    def check_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        state_dict: Dict[str, Any],
        **kwargs,
    ):
        from ..integrations import FbgemmFp8Linear

        module, tensor_name = get_module_from_name(model, param_name)

        if isinstance(module, FbgemmFp8Linear):
            if self.pre_quantized or tensor_name == "bias":
                if tensor_name == "weight" and param_value.dtype != torch.float8_e4m3fn:
                    raise ValueError("Expect quantized weights but got an unquantized weight")
                return False
            else:
                if tensor_name == "weight_scale":
                    raise ValueError("Expect unquantized weights but got a quantized weight_scale")
                return True
        return False
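
    # Illustrative sketch (hypothetical tensor, not part of the class): for a
    # pre-quantized checkpoint, the branch above verifies that incoming "weight"
    # entries already arrive in float8:
    #
    #     w = torch.zeros(16, 16, dtype=torch.float8_e4m3fn)
    #     w.dtype == torch.float8_e4m3fn  # True; an unquantized bf16 weight would raise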

    def create_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        target_device: "torch.device",
        state_dict: Dict[str, Any],
        unexpected_keys: Optional[List[str]] = None,
    ):
        """
        Quantizes weights into weight and weight_scale
        """
        new_value, weight_scale = torch.ops.fbgemm.quantize_fp8_per_row(param_value)

        module, tensor_name = get_module_from_name(model, param_name)
        module._buffers[tensor_name] = new_value.to(target_device)
        # The per-row scales are reshaped to (out_features, 1) so they broadcast
        # against the rows of the quantized weight.
        module._buffers["weight_scale"] = weight_scale.view(weight_scale.shape[0], 1).to(target_device)

        if unexpected_keys is not None and param_name in unexpected_keys:
            unexpected_keys.remove(param_name)
        del param_name
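
    # Illustrative sketch (assumes fbgemm-gpu is installed and a CUDA device is
    # available): `torch.ops.fbgemm.quantize_fp8_per_row` returns the weight cast to
    # float8_e4m3fn together with one fp32 scale per output row, which is why the
    # scale above is viewed as (out_features, 1) before being stored:
    #
    #     w = torch.randn(4096, 11008, dtype=torch.bfloat16, device="cuda")
    #     w_fp8, scale = torch.ops.fbgemm.quantize_fp8_per_row(w)
    #     w_fp8.dtype    # torch.float8_e4m3fn
    #     scale.shape    # torch.Size([4096]), one scale per row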

    def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
        return model

    def _process_model_before_weight_loading(
        self,
        model: "PreTrainedModel",
        device_map,
        keep_in_fp32_modules: List[str] = [],
        **kwargs,
    ):
        from ..integrations import get_keys_to_not_convert, replace_with_fbgemm_fp8_linear

        self.modules_to_not_convert = get_keys_to_not_convert(model)

        if self.quantization_config.modules_to_not_convert is not None:
            self.modules_to_not_convert.extend(self.quantization_config.modules_to_not_convert)

        model = replace_with_fbgemm_fp8_linear(
            model,
            modules_to_not_convert=self.modules_to_not_convert,
            quantization_config=self.quantization_config,
            pre_quantized=self.pre_quantized,
        )

        model.config.quantization_config = self.quantization_config

    def update_missing_keys(self, model, missing_keys: List[str], prefix: str) -> List[str]:
        from ..integrations import FbgemmFp8Linear

        not_missing_keys = []
        for name, module in model.named_modules():
            if isinstance(module, FbgemmFp8Linear):
                for missing in missing_keys:
                    if (
                        (name in missing or name in f"{prefix}.{missing}")
                        and not missing.endswith(".weight")
                        and not missing.endswith(".bias")
                    ):
                        not_missing_keys.append(missing)
        return [k for k in missing_keys if k not in not_missing_keys]

    def is_serializable(self, safe_serialization=None):
        return True

    @property
    def is_trainable(self) -> bool:
        return False