
"""AWQ (Activation aware Weight Quantization) integration file"""

import importlib

from packaging import version

from ..activations import ACT2FN
from ..modeling_utils import PreTrainedModel
from ..utils import is_auto_awq_available, is_ipex_available, is_torch_available, logging
from ..utils.quantization_config import (
    AwqBackendPackingMethod,
    AwqConfig,
    AWQLinearVersion,
    ExllamaVersion,
)


if is_torch_available():
    import torch
    import torch.nn as nn


logger = logging.get_logger(__name__)

AWQ_FUSED_MAPPINGS = {
    "mistral": {
        "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
        "mlp": ["gate_proj", "up_proj", "down_proj"],
        "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
        "use_alibi": False,
    },
    "mixtral": {
        "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
        "mlp": ["w1", "w3", "w2"],
        "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
        "use_alibi": False,
        "rope_theta": 1000000.0,
    },
    "llama": {
        "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
        "mlp": ["gate_proj", "up_proj", "down_proj"],
        "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
        "use_alibi": False,
    },
    "llava": {
        "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
        "mlp": ["gate_proj", "up_proj", "down_proj"],
        "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
        "use_alibi": False,
    },
}

AWQ_SCALES_MAPPINGS = {
    "starcoder2": {"act": "act", "layer_before_act": "c_fc"},
    "RefinedWebModel": {"act": "act", "layer_before_act": "dense_h_to_4h"},
    "falcon": {"act": "act", "layer_before_act": "dense_h_to_4h"},
    "mpt": {"act": "act", "layer_before_act": "up_proj"},
    "gptj": {"act": "act", "layer_before_act": "fc_in"},
    "gpt_neox": {"act": "act", "layer_before_act": "dense_h_to_4h"},
    "gpt_bigcode": {"act": "act", "layer_before_act": "c_fc"},
    "bloom": {"act": "gelu_impl", "layer_before_act": "dense_h_to_4h"},
}


def replace_quantization_scales(model, model_type):
    # Wrap the activation modules of supported architectures with `ScaledActivation`
    # so that AWQ activation scales can be loaded and applied.
    from awq.modules.act import ScaledActivation

    if model_type not in AWQ_SCALES_MAPPINGS:
        return model

    for name, module in model.named_children():
        act_name = AWQ_SCALES_MAPPINGS[model_type]["act"]
        layer_before_act_name = AWQ_SCALES_MAPPINGS[model_type]["layer_before_act"]
        if name == act_name and hasattr(model, layer_before_act_name):
            layer_before_act = getattr(model, layer_before_act_name)
            size = layer_before_act.out_features
            scale_like = torch.ones(size)
            model._modules[name] = ScaledActivation(module, scale_like)
        _ = replace_quantization_scales(module, model_type)
    return model


def replace_with_awq_linear(
    model,
    modules_to_not_convert=None,
    quantization_config=None,
    current_key_name=None,
    has_been_replaced=False,
):
    """
    Public method that recursively replaces the Linear layers of the given model with AWQ quantized layers.
    `accelerate` is needed to use this method. Returns the converted model and a boolean that indicates if the
    conversion has been successful or not.

    During the module replacement, we also infer the backend to use through the `quantization_config` object.

    Args:
        model (`torch.nn.Module`):
            The model to convert, can be any `torch.nn.Module` instance.
        quantization_config (`AwqConfig`):
            The quantization config object that contains the quantization parameters.
        modules_to_not_convert (`list`, *optional*):
            A list of modules to not convert. If a module name is in the list (e.g. `lm_head`), it will not be
            converted.
        current_key_name (`list`, *optional*):
            A list that contains the current key name. This is used for recursion and should not be passed by the user.
        has_been_replaced (`bool`, *optional*):
            A boolean that indicates if the conversion has been successful or not. This is used for recursion and
            should not be passed by the user.
    """
    if modules_to_not_convert is None:
        modules_to_not_convert = []

    backend = quantization_config.backend

    if not is_auto_awq_available():
        raise ValueError(
            "AWQ (either `autoawq` or `llmawq`) is not available. Please install it with `pip install autoawq` or"
            " check out the installation guide in https://github.com/mit-han-lab/llm-awq"
        )

    if backend == AwqBackendPackingMethod.AUTOAWQ:
        if quantization_config.version == AWQLinearVersion.GEMM:
            from awq.modules.linear.gemm import WQLinear_GEMM

            target_cls = WQLinear_GEMM
        elif quantization_config.version == AWQLinearVersion.GEMV:
            from awq.modules.linear.gemv import WQLinear_GEMV

            target_cls = WQLinear_GEMV
        elif quantization_config.version == AWQLinearVersion.EXLLAMA:
            if quantization_config.exllama_config["version"] == ExllamaVersion.ONE:
                from awq.modules.linear.exllama import WQLinear_Exllama

                target_cls = WQLinear_Exllama
            elif quantization_config.exllama_config["version"] == ExllamaVersion.TWO:
                from awq.modules.linear.exllamav2 import WQLinear_ExllamaV2

                target_cls = WQLinear_ExllamaV2
            else:
                raise ValueError(f"Unrecognized Exllama version: {quantization_config.exllama_config['version']}")
        elif quantization_config.version == AWQLinearVersion.IPEX:
            from awq.modules.linear.gemm_ipex import WQLinear_IPEX

            target_cls = WQLinear_IPEX
        else:
            raise ValueError(f"Unrecognized AWQ version: {quantization_config.version}")
    else:
        from awq.quantize.qmodule import WQLinear

        target_cls = WQLinear

    for name, module in model.named_children():
        if current_key_name is None:
            current_key_name = []
        current_key_name.append(name)

        if isinstance(module, nn.Linear) and name not in modules_to_not_convert:
            # Check if the current key is not in the `modules_to_not_convert`
            if not any(key in ".".join(current_key_name) for key in modules_to_not_convert):
                in_features = module.in_features
                out_features = module.out_features

                model._modules[name] = target_cls(
                    w_bit=quantization_config.bits,
                    group_size=quantization_config.group_size,
                    in_features=in_features,
                    out_features=out_features,
                    bias=module.bias is not None,
                    dev=module.weight.device,
                )
                has_been_replaced = True

                # Force requires_grad to False to avoid unexpected errors
                model._modules[name].requires_grad_(False)
        if len(list(module.children())) > 0:
            _, has_been_replaced = replace_with_awq_linear(
                module,
                modules_to_not_convert=modules_to_not_convert,
                current_key_name=current_key_name,
                quantization_config=quantization_config,
                has_been_replaced=has_been_replaced,
            )
        # Remove the last key for recursion
        current_key_name.pop(-1)
    return model, has_been_replaced
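

# Illustrative sketch (not part of this module's public surface): the AWQ quantizer
# normally drives `replace_with_awq_linear` while an AWQ checkpoint is loaded, but it
# can also be called directly on an already-instantiated model skeleton. The config
# values below are assumptions for the example only.
#
#     from transformers import AwqConfig
#
#     quantization_config = AwqConfig(bits=4, group_size=128, version="gemm")
#     model, has_been_replaced = replace_with_awq_linear(
#         model,
#         modules_to_not_convert=["lm_head"],
#         quantization_config=quantization_config,
#     )
#     if not has_been_replaced:
#         logger.warning("No linear layers were replaced with AWQ layers.")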


def get_modules_to_fuse(model, quantization_config):
    """
    Returns the fusing mapping given the quantization config and the model

    Args:
        model (`~PreTrainedModel`):
            The model to fuse - note this model should have been converted into AWQ format beforehand.
        quantization_config (`~transformers.quantization_config.AWQConfig`):
            The quantization configuration to use.
    """
    if not isinstance(model, PreTrainedModel):
        raise TypeError(f"The model should be an instance of `PreTrainedModel`, got {model.__class__.__name__}")

    # Always default to `quantization_config.modules_to_fuse`
    if quantization_config.modules_to_fuse is not None:
        current_fused_mapping = quantization_config.modules_to_fuse
        current_fused_mapping["max_seq_len"] = quantization_config.fuse_max_seq_len
    elif model.config.model_type in AWQ_FUSED_MAPPINGS:
        current_fused_mapping = AWQ_FUSED_MAPPINGS[model.config.model_type]

        # Properly deal with the case where the model is multimodal (e.g. llava)
        config = model.config.get_text_config(decoder=True)

        # Handle hidden_size, num_attention_heads and num_key_value_heads on our own
        hidden_size = config.hidden_size
        num_attention_heads = config.num_attention_heads
        num_key_value_heads = getattr(config, "num_key_value_heads", num_attention_heads)

        # Fill `current_fused_mapping` with the expected values
        current_fused_mapping["hidden_size"] = hidden_size
        current_fused_mapping["num_attention_heads"] = num_attention_heads
        current_fused_mapping["num_key_value_heads"] = num_key_value_heads
        current_fused_mapping["max_seq_len"] = quantization_config.fuse_max_seq_len
    else:
        raise ValueError(
            "Fusing mapping not found either on the quantization config or the supported `AWQ_FUSED_MAPPINGS`. Please"
            " pass a `fused_mapping` argument in the `quantization_config` or raise an issue on transformers"
            " https://github.com/huggingface/transformers to add its support."
        )
    return current_fused_mapping


def fuse_awq_modules(model, quantization_config):
    """
    Optionally fuse some modules in the model to speedup inference.

    Args:
        model (`~PreTrainedModel`):
            The model to fuse - note this model should have been converted into AWQ format beforehand.
        quantization_config (`Union[AwqConfig, dict]`):
            The quantization configuration to use.
    """
    # We need to convert it from a dict in order to get an `AwqConfig` object,
    # otherwise fields such as `backend` would not be available.
    if isinstance(quantization_config, dict):
        quantization_config = AwqConfig.from_dict(quantization_config)
    backend = quantization_config.backend

    modules_to_fuse = get_modules_to_fuse(model, quantization_config)
    modules_to_not_convert = getattr(quantization_config, "modules_to_not_convert", None)

    if backend == AwqBackendPackingMethod.AUTOAWQ:
        from awq.modules.fused.attn import QuantAttentionFused
        from awq.modules.fused.mlp import QuantFusedMLP
        from awq.modules.fused.norm import FasterTransformerRMSNorm
    else:
        raise ValueError("Fusing is only supported for the AutoAWQ backend")

    fused_attention_modules = []

    for name, module in model.named_modules():
        if modules_to_not_convert is not None:
            if any(module_name_to_not_convert in name for module_name_to_not_convert in modules_to_not_convert):
                continue

        # Replace layer norms
        _fuse_awq_layernorm(modules_to_fuse["layernorm"], module, FasterTransformerRMSNorm)

        # Replace MLP layers if the AWQ version is not ipex
        if quantization_config.version != "ipex":
            _fuse_awq_mlp(model, name, modules_to_fuse["mlp"], module, QuantFusedMLP)
        else:
            logger.info("The IPEX version AWQ does not support fuse mlp for now.")

        # Replace attention layers
        attention_has_been_fused = _fuse_awq_attention_layers(
            model, module, modules_to_fuse, name, QuantAttentionFused
        )

        if attention_has_been_fused:
            fused_attention_modules.append(name.split(".")[0])

    # For models with fused attention, set `config._attn_implementation` to "custom" so that
    # attention mask handling is delegated to the fused `QuantAttentionFused` modules.
    if len(fused_attention_modules) > 0:
        for module_name, module in model.named_modules():
            if any(
                module_name in fused_attention_modules for fused_attention_parent_module in fused_attention_modules
            ):
                if hasattr(module, "config") and hasattr(module.config, "_attn_implementation"):
                    module.config._attn_implementation = "custom"
    return model


def _fuse_awq_layernorm(fuse_module_names, module, target_cls):
    """
    Fuse the LayerNorm layers into a target class using autoawq

    Args:
        fuse_module_names (`List[str]`):
            The list of module names to fuse
        module (`nn.Module`):
            The pytorch parent module that has layernorm modules to fuse
        target_cls (`~autoawq.FasterTransformerRMSNorm`):
            The `FasterTransformerRMSNorm` class as it only supports that class
            for now.
    """
    for module_name in fuse_module_names:
        if hasattr(module, module_name):
            old_module = getattr(module, module_name)
            module._modules[module_name] = target_cls(
                old_module.weight,
                old_module.variance_epsilon,
            ).to(old_module.weight.device)
            del old_module


def _fuse_awq_mlp(model, current_module_name, fuse_module_names, module, target_cls):
    """
    Fuse the MLP layers into a target class using autoawq

    Args:
        model (`~PreTrainedModel`):
            The input pretrained model
        current_module_name (`str`):
            The current submodule name
        fuse_module_names (`List[str]`):
            The list of module names to fuse. For the MLP layers it has to be an array
            of length 3 that consists of the 3 MLP layers in the order (gate (dense layer post-attention) / up / down layers)
        module (`nn.Module`):
            The pytorch parent module that has layernorm modules to fuse
        target_cls (`~autoawq.QuantFusedMLP`):
            The `QuantFusedMLP` class as it only supports that class
            for now.
    """
    if len(fuse_module_names) == 0:
        return

    if hasattr(module, fuse_module_names[0]):
        gate_proj = getattr(module, fuse_module_names[0])
        up_proj = getattr(module, fuse_module_names[1])
        down_proj = getattr(module, fuse_module_names[2])

        previous_device = gate_proj.qweight.device

        # Deal also with the case where the model has a `text_config` attribute (e.g. multimodal models)
        config = model.config.get_text_config(decoder=True)
        hidden_act = config.hidden_act
        activation_fn = ACT2FN[hidden_act]
        new_module = target_cls(gate_proj, down_proj, up_proj, activation_fn)

        parent_name, child_name = current_module_name.rsplit(".", 1)
        parent = model.get_submodule(parent_name)
        setattr(parent, child_name, new_module.to(previous_device))

        del gate_proj, up_proj, down_proj


def _fuse_awq_attention_layers(model, module, modules_to_fuse, current_module_name, target_cls):
    """
    Fuse the Attention layers into a target class using autoawq

    Args:
        model (`~PreTrainedModel`):
            The input pretrained model
        module (`nn.Module`):
            The pytorch parent module that has layernorm modules to fuse
        modules_to_fuse (`Dict[str, Any]`):
            The module fusing mapping. The dictionary has to contain a field `attention` with attention module names
            in the correct order: q, k, v, o layer
        current_module_name (`str`):
            The current submodule name
        target_cls (`~autoawq.QuantAttentionFused`):
            The `QuantAttentionFused` class as it only supports that class
            for now.
    """
    from awq.modules.linear import WQLinear_GEMM, WQLinear_GEMV

    module_has_been_fused = False

    if len(modules_to_fuse["attention"]) == 0:
        return module_has_been_fused

    if hasattr(module, modules_to_fuse["attention"][0]):
        # First, we pack the QKV layers together
        q_proj = getattr(module, modules_to_fuse["attention"][0])

        if isinstance(q_proj, WQLinear_GEMV):
            linear_target_cls = WQLinear_GEMV
            cat_dim = 0
        elif isinstance(q_proj, WQLinear_GEMM):
            linear_target_cls = WQLinear_GEMM
            cat_dim = 1
        elif is_ipex_available() and version.parse(importlib.metadata.version("autoawq")) > version.parse("0.2.6"):
            from awq.modules.linear import WQLinear_IPEX

            if isinstance(q_proj, WQLinear_IPEX):
                linear_target_cls = WQLinear_IPEX
                cat_dim = 1
        else:
            raise ValueError(f"Unsupported q_proj type: {type(q_proj)}")

        previous_device = q_proj.qweight.device

        k_proj = getattr(module, modules_to_fuse["attention"][1])
        v_proj = getattr(module, modules_to_fuse["attention"][2])
        o_proj = getattr(module, modules_to_fuse["attention"][3])

        bias = torch.cat([q_proj.bias, k_proj.bias, v_proj.bias], dim=0) if q_proj.bias is not None else None

        qkv_layer = linear_target_cls(
            q_proj.w_bit,
            q_proj.group_size,
            q_proj.in_features,
            q_proj.out_features + k_proj.out_features + v_proj.out_features,
            q_proj.bias is not None,
            next(iter(module.state_dict().values())).device,
        )

        qkv_layer.qweight = torch.cat([q_proj.qweight, k_proj.qweight, v_proj.qweight], dim=cat_dim)
        qkv_layer.qzeros = torch.cat([q_proj.qzeros, k_proj.qzeros, v_proj.qzeros], dim=cat_dim)
        qkv_layer.scales = torch.cat([q_proj.scales, k_proj.scales, v_proj.scales], dim=cat_dim)

        if isinstance(qkv_layer, WQLinear_GEMV):
            qkv_layer.split_k_iters = q_proj.split_k_iters

        qkv_layer.bias = bias

        fused_attention_layer = target_cls(
            modules_to_fuse["hidden_size"],
            modules_to_fuse["num_attention_heads"],
            modules_to_fuse["num_key_value_heads"],
            qkv_layer,
            o_proj,
            previous_device,
            modules_to_fuse["max_seq_len"],
            use_alibi=modules_to_fuse["use_alibi"],
            # The default value in autoawq is set to 10000.0
            rope_theta=modules_to_fuse.get("rope_theta", 10000.0),
        )

        fused_attention_layer.is_hf_transformers = True

        parent_name, child_name = current_module_name.rsplit(".", 1)
        parent = model.get_submodule(parent_name)
        setattr(parent, child_name, fused_attention_layer.to(previous_device))

        del q_proj, k_proj, v_proj, o_proj
        module_has_been_fused = True

    return module_has_been_fused


def post_init_awq_exllama_modules(model, exllama_config):
    """
    Runs post init for Exllama layers which performs:
        - Weights unpacking, reordering and repacking
        - Devices scratch space allocation
    """
    if exllama_config["version"] == ExllamaVersion.ONE:
        from awq.modules.linear.exllama import exllama_post_init

        model = exllama_post_init(model)
    elif exllama_config["version"] == ExllamaVersion.TWO:
        from awq.modules.linear.exllamav2 import exllamav2_post_init

        model = exllamav2_post_init(
            model,
            max_input_len=exllama_config["max_input_len"],
            max_batch_size=exllama_config["max_batch_size"],
        )
    else:
        raise ValueError(f"Unrecognized Exllama version: {exllama_config['version']}")

    return model
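

# Illustrative sketch (assumed values): `exllama_config` is a plain dict built by the AWQ
# quantizer from `AwqConfig.exllama_config`; for ExLlamaV2 it could look like this.
#
#     exllama_config = {
#         "version": ExllamaVersion.TWO,
#         "max_input_len": 2048,
#         "max_batch_size": 8,
#     }
#     model = post_init_awq_exllama_modules(model, exllama_config)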


def post_init_awq_ipex_modules(model):
    """
    Runs post init for IPEX layers which performs:
        - Weights packing, reordering and repacking
    """
    from awq.modules.linear.gemm_ipex import ipex_post_init

    model = ipex_post_init(model)

    return model
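

# Illustrative end-to-end sketch (the checkpoint name is hypothetical): when an AWQ
# checkpoint is loaded through `from_pretrained`, the AWQ quantizer calls
# `replace_with_awq_linear` while the model skeleton is built and, if `do_fuse=True`,
# `fuse_awq_modules` once the weights are loaded.
#
#     from transformers import AutoModelForCausalLM, AwqConfig
#
#     quantization_config = AwqConfig(bits=4, do_fuse=True, fuse_max_seq_len=512)
#     model = AutoModelForCausalLM.from_pretrained(
#         "org/model-awq",  # hypothetical AWQ-quantized checkpoint
#         quantization_config=quantization_config,
#         device_map="auto",
#     )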