
    Ng"                     ~    d dl Z d dlmZmZmZmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ dZdZd	Z G d
 de	          ZdS )    N)AnyListMappingOptional)CallbackManagerForLLMRun)LLM)
ConfigDict)enforce_stop_tokenszgoogle/flan-t5-largetext2text-generation)r   text-generationsummarizationc                      e Zd ZU dZdZeed<   eZe	ed<   	 dZ
ee         ed<   	 dZee         ed<   	  ed          Ze	 	 	 	 	 	 	 dde	de	dee         dee	         dee         dee         dee         dee         dee         dedefd            Zedee	ef         fd            Zede	fd            Z	 	 dde	deee	                  dee         dede	f
dZdS )WeightOnlyQuantPipelinea  Weight only quantized model.

    To use, you should have the `intel-extension-for-transformers` packabge and
        `transformers` package installed.
    intel-extension-for-transformers:
        https://github.com/intel/intel-extension-for-transformers

    Example using from_model_id:
        .. code-block:: python

            from langchain_community.llms import WeightOnlyQuantPipeline
            from intel_extension_for_transformers.transformers import (
                WeightOnlyQuantConfig
            )
            config = WeightOnlyQuantConfig
            hf = WeightOnlyQuantPipeline.from_model_id(
                model_id="google/flan-t5-large",
                task="text2text-generation"
                pipeline_kwargs={"max_new_tokens": 10},
                quantization_config=config,
            )
    Example passing pipeline in directly:
        .. code-block:: python

            from langchain_community.llms import WeightOnlyQuantPipeline
            from intel_extension_for_transformers.transformers import (
                AutoModelForSeq2SeqLM
            )
            from intel_extension_for_transformers.transformers import (
                WeightOnlyQuantConfig
            )
            from transformers import AutoTokenizer, pipeline

            model_id = "google/flan-t5-large"
            tokenizer = AutoTokenizer.from_pretrained(model_id)
            config = WeightOnlyQuantConfig
            model = AutoModelForSeq2SeqLM.from_pretrained(
                model_id,
                quantization_config=config,
            )
            pipe = pipeline(
                "text-generation",
                model=model,
                tokenizer=tokenizer,
                max_new_tokens=10,
            )
            hf = WeightOnlyQuantPipeline(pipeline=pipe)
    Npipelinemodel_idmodel_kwargspipeline_kwargsallow)extraFtaskdevice
device_mapload_in_4bitload_in_8bitquantization_configkwargsreturnc
           	         |*t          |t                    r|dk    rt          d          t          j                            d          t          d          	 ddlm}m} ddl	m
} dd	lm} dd
lm} n# t          $ r t          d          w xY wt          |t                    r2|dk    r, |            st          d          dt          |          z   }nt          |t                    r|dk     rd}||d}|pi } |j        |fi |}	 |dk    r |j        |f|||	d|d|}n4|dv r |j        |f|||	d|d|}nt          d| dt"           d          n&# t          $ r}t          d| d          |d}~ww xY wd|v rd |                                D             }|pi } |d|||||d|}|j        t"          vr t          d|j         dt"           d           | d||||d|
S )z5Construct the pipeline object from model_id and task.Nr   z7`Device` and `device_map` cannot be set simultaneously!torchz;Weight only quantization pipeline only support PyTorch now!r   )AutoModelForCausalLMAutoModelForSeq2SeqLM)is_ipex_available)AutoTokenizer)r   zCould not import transformers python package. Please install it with `pip install transformers` and `pip install intel-extension-for-transformers`.z)Don't find out Intel GPU on this machine!zxpu:cpur   F)r   r   r   use_llm_runtimer   )r   r   Got invalid task , currently only  are supportedzCould not load the z# model due to missing dependencies.trust_remote_codec                 &    i | ]\  }}|d k    ||S )r*    ).0kvs      m/var/www/html/ai-engine/env/lib/python3.11/site-packages/langchain_community/llms/weight_only_quantization.py
<dictcomp>z9WeightOnlyQuantPipeline.from_model_id.<locals>.<dictcomp>   s0       A!?R:R:R1:R:R:R    )r   model	tokenizerr   r   )r   r   r   r   r,   )
isinstanceint
ValueError	importlibutil	find_spec-intel_extension_for_transformers.transformersr!   r"   ,intel_extension_for_transformers.utils.utilsr#   transformersr$   r   ImportErrorstrfrom_pretrainedVALID_TASKSitemsr   )clsr   r   r   r   r   r   r   r   r   r   r!   r"   r#   r$   hf_pipeline_model_kwargsr4   r3   e_pipeline_kwargsr   s                         r0   from_model_idz%WeightOnlyQuantPipeline.from_model_idO   s    !z&#'>'>!6B;;VWWW>##G,,4M  	        WVVVVV222222<<<<<<< 	 	 	F  	 fc"" 	v{{$$&& N !LMMM#f++-JJ$$ 	!F>!"
$*1M1(LLmLL		(((<,<!-!-(;$))  $  BBB=-=!-!-(;$))  $  !B B B&1B B B    	 	 	OdOOO 	
 -// !.!4!4!6!6  M +0b; 
&
 
 
 
 =++>HM > >"-> > >   s 
&,	
 

 
 
 	
s%   A7 7BAE! !
F+E??Fc                 ,    | j         | j        | j        dS )zGet the identifying parameters.r   r   r   rJ   selfs    r0   _identifying_paramsz+WeightOnlyQuantPipeline._identifying_params   s$      -#3
 
 	
r2   c                     dS )zReturn type of llm.weight_only_quantizationr,   rK   s    r0   	_llm_typez!WeightOnlyQuantPipeline._llm_type   s
     *)r2   promptstoprun_managerc                    |                      |          }| j         j        dk    r$|d         d         t          |          d         }nc| j         j        dk    r|d         d         }nD| j         j        dk    r|d         d         }n%t          d| j         j         d	t           d
          |rt          ||          }|S )ab  Call the HuggingFace model and return the output.

        Args:
            prompt: The prompt to use for generation.
            stop: A list of strings to stop generation when encountered.

        Returns:
            The generated text.

        Example:
            .. code-block:: python

                from langchain_community.llms import WeightOnlyQuantPipeline
                llm = WeightOnlyQuantPipeline.from_model_id(
                    model_id="google/flan-t5-large",
                    task="text2text-generation",
                )
                llm.invoke("This is a prompt.")
        r   r   generated_textNr   r   summary_textr'   r(   r)   )r   r   lenr7   rA   r
   )rL   rQ   rR   rS   r   responsetexts          r0   _callzWeightOnlyQuantPipeline._call   s    4 ==((=!222A;/0V?DD]#999A;/0DD]?22A;~.DD>DM$6 > >"-> > >    	3 'tT22Dr2   )r   NNNFFN)NN)__name__
__module____qualname____doc__r   r   __annotations__DEFAULT_MODEL_IDr   r?   r   r   dictr   r	   model_configclassmethodr6   boolr   rH   propertyr   rM   rP   r   r   rZ   r,   r2   r0   r   r      s        / /b Hc$Hc$$$*#'L(4.'''1&*OXd^***4:  L 
 !#$('+*.',',-1h
 h
h
 h
 	h

 SMh
 tnh
 "$h
 tnh
 tnh
 &c]h
 h
 
h
 h
 h
 [h
T 
WS#X%6 
 
 
 X
 *3 * * * X* %):>	+ ++ tCy!+ 67	+
 + 
+ + + + + +r2   r   )r8   typingr   r   r   r    langchain_core.callbacks.managerr   #langchain_core.language_models.llmsr   pydanticr	   langchain_community.llms.utilsr
   r`   DEFAULT_TASKrA   r   r,   r2   r0   <module>rl      s        / / / / / / / / / / / / E E E E E E 3 3 3 3 3 3       > > > > > >) %Jd d d d dc d d d d dr2   