
"""Base classes for comparing the output of two models."""

from __future__ import annotations

import logging
import re
from typing import Any, Dict, List, Optional, Union

from langchain_core.callbacks.manager import Callbacks
from langchain_core.language_models import BaseLanguageModel
from langchain_core.output_parsers import BaseOutputParser
from langchain_core.prompts.prompt import PromptTemplate
from pydantic import ConfigDict, Field

from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
from langchain.chains.llm import LLMChain
from langchain.evaluation.comparison.prompt import (
    COMPARISON_TEMPLATE,
    COMPARISON_TEMPLATE_WITH_REFERENCE,
    CRITERIA_INSTRUCTIONS,
)
from langchain.evaluation.criteria.eval_chain import (
    CRITERIA_TYPE,
    Criteria,
)
from langchain.evaluation.schema import LLMEvalChain, PairwiseStringEvaluator
from langchain.schema import RUN_KEY

logger = logging.getLogger(__name__)

_FIND_DOUBLE_BRACKETS = re.compile(r"\[\[(.*?)\]\]")

_SUPPORTED_CRITERIA = {
    Criteria.CONCISENESS: "Is the submission concise and to the point?",
    Criteria.RELEVANCE: "Is the submission referring to a real quote from the text?",
    Criteria.CORRECTNESS: "Is the submission correct, accurate, and factual?",
    Criteria.COHERENCE: "Is the submission coherent, well-structured, and organized?",
    Criteria.HARMFULNESS: "Is the submission harmful, offensive, or inappropriate?",
    Criteria.MALICIOUSNESS: "Is the submission malicious in any way?",
    Criteria.HELPFULNESS: "Is the submission helpful, insightful, and appropriate?",
    Criteria.CONTROVERSIALITY: "Is the submission controversial or debatable?",
    Criteria.MISOGYNY: "Is the submission misogynistic or sexist?",
    Criteria.CRIMINALITY: "Is the submission criminal in any way?",
    Criteria.INSENSITIVITY: "Is the submission insensitive to any group of people?",
    Criteria.DEPTH: "Does the submission demonstrate depth of thought?",
    Criteria.CREATIVITY: "Does the submission demonstrate novelty or unique ideas?",
    Criteria.DETAIL: "Does the submission demonstrate attention to detail?",
}


def resolve_pairwise_criteria(
    criteria: Optional[Union[CRITERIA_TYPE, str, List[CRITERIA_TYPE]]],
) -> dict:
    """Resolve the criteria for the pairwise evaluator.

    Args:
        criteria (Union[CRITERIA_TYPE, str, List[CRITERIA_TYPE]], optional):
            The criteria to use.

    Returns:
        dict: The resolved criteria.

    """
    if criteria is None:
        # No criteria given: fall back to a default set of criteria.
        _default_criteria = [
            Criteria.HELPFULNESS,
            Criteria.RELEVANCE,
            Criteria.CORRECTNESS,
            Criteria.DEPTH,
        ]
        return {k.value: _SUPPORTED_CRITERIA[k] for k in _default_criteria}
    elif isinstance(criteria, Criteria):
        criteria_ = {criteria.value: _SUPPORTED_CRITERIA[criteria]}
    elif isinstance(criteria, str):
        if criteria in _SUPPORTED_CRITERIA:
            criteria_ = {criteria: _SUPPORTED_CRITERIA[Criteria(criteria)]}
        else:
            # Unknown name: treat it as a bare criterion with no description.
            criteria_ = {criteria: ""}
    elif isinstance(criteria, ConstitutionalPrinciple):
        criteria_ = {criteria.name: criteria.critique_request}
    elif isinstance(criteria, (list, tuple)):
        # Resolve each criterion recursively and merge the resulting dicts.
        criteria_ = {
            k: v
            for criterion in criteria
            for k, v in resolve_pairwise_criteria(criterion).items()
        }
    else:
        if not criteria:
            raise ValueError(
                "Criteria cannot be empty. "
                "Please provide a criterion name or a mapping of the criterion name"
                " to its description."
            )
        criteria_ = dict(criteria)
    return criteria_

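# Usage sketch for ``resolve_pairwise_criteria`` (illustrative comments only,
# nothing here runs on import). The expected outputs below assume the default
# descriptions in ``_SUPPORTED_CRITERIA`` above; the inline "clarity" mapping
# is a hypothetical custom criterion:
#
#     resolve_pairwise_criteria(None)
#     # -> {"helpfulness": "...", "relevance": "...",
#     #     "correctness": "...", "depth": "..."}
#     resolve_pairwise_criteria(Criteria.CONCISENESS)
#     # -> {"conciseness": "Is the submission concise and to the point?"}
#     resolve_pairwise_criteria([Criteria.DEPTH, {"clarity": "Is it clear?"}])
#     # -> a single merged dict containing both criteria
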
dZdS ) PairwiseStringResultOutputParserzA parser for the output of the PairwiseStringEvalChain.

    Attributes:
        _type (str): The type of the output parser.

    r   r2   c                    dS )zqReturn the type of the output parser.

        Returns:
            str: The type of the output parser.

        pairwise_string_resultr   selfs    r$   _typez&PairwiseStringResultOutputParser._typej   s
     ('r&   textDict[str, Any]c                    t                               |          }|r|                    d          }|r|dvrt          d| d          |dk    rdn|}dddd	|         }|||d
S )zParse the output text.

        Args:
            text (str): The output text to parse.

        Returns:
            Dict: The parsed output.

        Raises:
            ValueError: If the verdict is invalid.

           >   ABCzInvalid output: zb. Output must contain a double bracketed string                 with the verdict 'A', 'B', or 'C'.rG   Nr   g      ?)rE   rF   rG   )	reasoningr    score)_FIND_DOUBLE_BRACKETSsearchgroupr7   )r?   rA   matchverdictverdict_rI   s         r$   parsez&PairwiseStringResultOutputParser.parset   s     &,,T22 	%kk!nnG 	6654 5 5 5   #c>>44w

class PairwiseStringEvalChain(PairwiseStringEvaluator, LLMEvalChain, LLMChain):
    """A chain for comparing two outputs, such as the outputs
 ed          Zed0d            Zed0d            Zed1d            Ze	dddd2d            Zd3d!Zd4d#Zdddddd$d%d5d-Zdddddd$d.d6d/ZdS )7PairwiseStringEvalChaina  A chain for comparing two outputs, such as the outputs
     of two models, prompts, or outputs of a single model on similar inputs.

    Attributes:
        output_parser (BaseOutputParser): The output parser for the chain.

    Example:
        >>> from langchain_community.chat_models import ChatOpenAI
        >>> from langchain.evaluation.comparison import PairwiseStringEvalChain
        >>> llm = ChatOpenAI(temperature=0, model_name="gpt-4", model_kwargs={"random_seed": 42})
        >>> chain = PairwiseStringEvalChain.from_llm(llm=llm)
        >>> result = chain.evaluate_string_pairs(
        ...     input = "What is the chemical formula for water?",
        ...     prediction = "H2O",
        ...     prediction_b = (
        ...        "The chemical formula for water is H2O, which means"
        ...        " there are two hydrogen atoms and one oxygen atom."
        ...     reference = "The chemical formula for water is H2O.",
        ... )
        >>> print(result)
        # {
        #    "value": "B",
        #    "comment": "Both responses accurately state"
        #       " that the chemical formula for water is H2O."
        #       " However, Response B provides additional information"
        # .     " by explaining what the formula means.\n[[B]]"
        # }

    resultsr2   
output_key)default_factoryr   output_parserr   boolc                    dS )NFr   )clss    r$   is_lc_serializablez*PairwiseStringEvalChain.is_lc_serializable   s    ur&   ignore)extrac                    dS )Return whether the chain requires a reference.

        Returns:
            bool: True if the chain requires a reference, False otherwise.

        Fr   r>   s    r$   requires_referencez*PairwiseStringEvalChain.requires_reference   s	     ur&   c                    dS )zReturn whether the chain requires an input.

        Returns:
            bool: True if the chain requires an input, False otherwise.

        Tr   r>   s    r$   requires_inputz&PairwiseStringEvalChain.requires_input   	     tr&   c                "    d| j         j         dS )zReturn the warning to show when reference is ignored.

        Returns:
            str: The warning to show when reference is ignored.

        zIgnoring reference in z, as it is not expected.
To use a reference, use the LabeledPairwiseStringEvalChain (EvaluatorType.LABELED_PAIRWISE_STRING) instead.)	__class__rR   r>   s    r$   _skip_reference_warningz/PairwiseStringEvalChain._skip_reference_warning   s&    @T^%< @ @ @	
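
    # Usage sketch for ``from_llm`` below (illustrative comments only): custom
    # criteria are resolved via ``resolve_pairwise_criteria`` and folded into
    # the prompt. ``my_llm`` is a placeholder for any chat model:
    #
    #     chain = PairwiseStringEvalChain.from_llm(
    #         llm=my_llm,
    #         criteria={"conciseness": "Is the answer short and direct?"},
    #     )
    #     result = chain.evaluate_string_pairs(
    #         input="Name a prime number.",
    #         prediction="2",
    #         prediction_b="A prime number is 2, since it has no divisors.",
    #     )
    #     result["score"]  # 1 if A preferred, 0 if B preferred, 0.5 for a tie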

    @classmethod
    def from_llm(
        cls,
        llm: BaseLanguageModel,
        *,
        prompt: Optional[PromptTemplate] = None,
        criteria: Optional[Union[CRITERIA_TYPE, str]] = None,
        **kwargs: Any,
    ) -> PairwiseStringEvalChain:
        """Initialize the PairwiseStringEvalChain from an LLM.

        Args:
            llm (BaseChatModel): The LLM to use (GPT-4 recommended).
            prompt (PromptTemplate, optional): The prompt to use.
            **kwargs (Any): Additional keyword arguments.

        Returns:
            PairwiseStringEvalChain: The initialized PairwiseStringEvalChain.

        Raises:
            ValueError: If the input variables are not as expected.

        """
        # Warn when the model is not a GPT-4 variant; this chain was only
        # validated against GPT-4.
        if hasattr(llm, "model_name") and not llm.model_name.startswith("gpt-4"):
            logger.warning(
                "This chain was only tested with GPT-4. "
                "Performance may be significantly worse with other models."
            )
        expected_input_vars = {"prediction", "prediction_b", "input", "criteria"}
        prompt_ = prompt or COMPARISON_TEMPLATE.partial(reference="")
        if expected_input_vars != set(prompt_.input_variables):
            raise ValueError(
                f"Input variables should be {expected_input_vars}, "
                f"but got {prompt_.input_variables}"
            )
        criteria_ = resolve_pairwise_criteria(criteria)
        criteria_str = "\n".join(f"{k}: {v}" if v else k for k, v in criteria_.items())
        criteria_str = CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else ""
        return cls(llm=llm, prompt=prompt_.partial(criteria=criteria_str), **kwargs)

    def _prepare_input(
        self,
        prediction: str,
        prediction_b: str,
        input: Optional[str],
        reference: Optional[str],
    ) -> dict:
        """Prepare the input for the chain.

        Args:
            prediction (str): The output string from the first model.
            prediction_b (str): The output string from the second model.
            input (str, optional): The input or task string.
            reference (str, optional): The reference string, if any.

        Returns:
            dict: The prepared input for the chain.

        """
        input_ = {
            "prediction": prediction,
            "prediction_b": prediction_b,
            "input": input,
        }
        if self.requires_reference:
            input_["reference"] = reference
        return input_

    def _prepare_output(self, result: dict) -> dict:
        """Prepare the output."""
        parsed = result[self.output_key]
        if RUN_KEY in result:
            parsed[RUN_KEY] = result[RUN_KEY]
        return parsed

    def _evaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        input: Optional[str] = None,
        reference: Optional[str] = None,
        callbacks: Callbacks = None,
        tags: Optional[List[str]] = None,
        metadata: Optional[Dict[str, Any]] = None,
        include_run_info: bool = False,
        **kwargs: Any,
    ) -> dict:
        """Evaluate whether output A is preferred to output B.

        Args:
            prediction (str): The output string from the first model.
            prediction_b (str): The output string from the second model.
            input (str, optional): The input or task string.
            callbacks (Callbacks, optional): The callbacks to use.
            reference (str, optional): The reference string, if any.
            **kwargs (Any): Additional keyword arguments.

        Returns:
            dict: A dictionary containing:
                - reasoning: The reasoning for the preference.
                - value: The preference value, which is either 'A', 'B', or None
                    for no preference.
                - score: The preference score, which is 1 for 'A', 0 for 'B',
                    and 0.5 for None.

        """
        input_ = self._prepare_input(prediction, prediction_b, input, reference)
        result = self(
            inputs=input_,
            callbacks=callbacks,
            tags=tags,
            metadata=metadata,
            include_run_info=include_run_info,
        )
        return self._prepare_output(result)

    async def _aevaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        callbacks: Callbacks = None,
        tags: Optional[List[str]] = None,
        metadata: Optional[Dict[str, Any]] = None,
        include_run_info: bool = False,
        **kwargs: Any,
    ) -> dict:
        """Asynchronously evaluate whether output A is preferred to output B.

        Args:
            prediction (str): The output string from the first model.
            prediction_b (str): The output string from the second model.
            input (str, optional): The input or task string.
            callbacks (Callbacks, optional): The callbacks to use.
            reference (str, optional): The reference string, if any.
            **kwargs (Any): Additional keyword arguments.

        Returns:
            dict: A dictionary containing:
                - reasoning: The reasoning for the preference.
                - value: The preference value, which is either 'A', 'B', or None
                    for no preference.
                - score: The preference score, which is 1 for 'A', 0 for 'B',
                    and 0.5 for None.

        """
        input_ = self._prepare_input(prediction, prediction_b, input, reference)
        result = await self.acall(
            inputs=input_,
            callbacks=callbacks,
            tags=tags,
            metadata=metadata,
            include_run_info=include_run_info,
        )
        return self._prepare_output(result)


class LabeledPairwiseStringEvalChain(PairwiseStringEvalChain):
    """A chain for comparing two outputs, such as the outputs
    of two models, prompts, or outputs of a single model on similar inputs,
    with labeled preferences.

    Attributes:
        output_parser (BaseOutputParser): The output parser for the chain.

    """

    @property
    def requires_reference(self) -> bool:
        """Return whether the chain requires a reference.

        Returns:
            bool: True if the chain requires a reference, False otherwise.

        """
        return True

    @classmethod
    def from_llm(
        cls,
        llm: BaseLanguageModel,
        *,
        prompt: Optional[PromptTemplate] = None,
        criteria: Optional[Union[CRITERIA_TYPE, str]] = None,
        **kwargs: Any,
    ) -> PairwiseStringEvalChain:
        """Initialize the LabeledPairwiseStringEvalChain from an LLM.

        Args:
            llm (BaseLanguageModel): The LLM to use.
            prompt (PromptTemplate, optional): The prompt to use.
            criteria (Union[CRITERIA_TYPE, str], optional): The criteria to use.
            **kwargs (Any): Additional keyword arguments.

        Returns:
            LabeledPairwiseStringEvalChain: The initialized
                LabeledPairwiseStringEvalChain.

        Raises:
            ValueError: If the input variables are not as expected.

        """
        expected_input_vars = {
            "prediction",
            "prediction_b",
            "input",
            "reference",
            "criteria",
        }
        prompt_ = prompt or COMPARISON_TEMPLATE_WITH_REFERENCE
        if expected_input_vars != set(prompt_.input_variables):
            raise ValueError(
                f"Input variables should be {expected_input_vars}, "
                f"but got {prompt_.input_variables}"
            )
        criteria_ = resolve_pairwise_criteria(criteria)
        criteria_str = "\n".join(f"{k}: {v}" for k, v in criteria_.items())
        criteria_str = CRITERIA_INSTRUCTIONS + criteria_str if criteria_str else ""
        return cls(llm=llm, prompt=prompt_.partial(criteria=criteria_str), **kwargs)