
    Ng3(                        d dl Z d dlZd dlmZmZmZ d dlmZ d dlm	Z	 d dl
mZmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ d dlmZmZ d dlmZmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z'  e j(        e)          Z* G d de          Z+ edgd          Z, edgd          Z- G d deee.                            Z/ G d de          Z0dS )    N)AnyListOptional)LLMChain)ConditionalPromptSelector)#AsyncCallbackManagerForRetrieverRunCallbackManagerForRetrieverRun)Document)BaseLLM)BaseOutputParser)BasePromptTemplatePromptTemplate)BaseRetriever)VectorStore)RecursiveCharacterTextSplitterTextSplitter)	BaseModelField)AsyncHtmlLoader)Html2TextTransformer)LlamaCpp)GoogleSearchAPIWrapperc                   D    e Zd ZU dZ edd          Zee         ed<   dS )SearchQueriesz/Search queries to research for the user's goal..z+List of search queries to look up on GoogledescriptionqueriesN)	__name__
__module____qualname____doc__r   r   r   str__annotations__     g/var/www/html/ai-engine/env/lib/python3.11/site-packages/langchain_community/retrievers/web_research.pyr   r      sM         99F  GT#Y     r%   r   questiona!  <<SYS>> 
 You are an assistant tasked with improving Google search results. 
 <</SYS>> 

 [INST] Generate THREE Google search queries that are similar to this question. The output should be a numbered list of questions and each should have a question mark at the end: 

 {question} [/INST])input_variablestemplatezYou are an assistant tasked with improving Google search results. Generate THREE Google search queries that are similar to this question. The output should be a numbered list of questions and each should have a question mark at the end: {question}c                   .    e Zd ZdZdedee         fdZdS )QuestionListOutputParserz/Output parser for a list of numbered questions.textreturnc                 0    t          j        d|          }|S )Nz\d+\..*?(?:\n|$))refindall)selfr,   liness      r&   parsezQuestionListOutputParser.parse8   s    
.55r%   N)r   r   r    r!   r"   r   r3   r$   r%   r&   r+   r+   5   s@        99# $s)      r%   r+   c                       e Zd ZU dZ edd          Zeed<   eed<    edd          Z	e
ed<    ed	d
          Zeed<    e edd          d          Zeed<    eed          Zee         ed<    edd          Zeed<   dZeed<   	 deddf fdZedd	 edd          ddfdedede
dee         dededededd fd            Zd edefd!Zd&d ededee         fd"Z d ed#e!dee"         fd$Z#d ed#e$dee"         fd%Z% xZ&S )'WebResearchRetrieverz`Google Search API` retriever..z"Vector store for storing web pagesr   vectorstore	llm_chainzGoogle Search API Wrappersearch   z!Number of pages per Google searchnum_search_resultsi  2   )
chunk_sizechunk_overlapz1Text splitter for splitting web pages into chunkstext_splitterzList of processed URLs)default_factoryr   url_databaseFz_Whether to use the http_proxy/https_proxy env variables or check .netrc for proxy configuration	trust_envallow_dangerous_requestskwargsr-   Nc                     |                     dd          }|st          d           t                      j        di | dS )zInitialize the retriever.rB   Fa  WebResearchRetriever crawls URLs surfaced through the provided search engine. It is possible that some of those URLs will end up pointing to machines residing on an internal network, leadingto an SSRF (Server-Side Request Forgery) attack. To protect yourself against that risk, you can run the requests through a proxy and prevent the crawler from accidentally crawling internal resources.If've taken the necessary precautions, you can set `allow_dangerous_requests` to `True`.Nr$   )get
ValueErrorsuper__init__)r1   rC   rB   	__class__s      r&   rH   zWebResearchRetriever.__init___   s\    #)::.H%#P#P ' 		8   	""6"""""r%      llmpromptc	           	          |s4t          t          d t          fg          }	|	                    |          }t	          ||t                                }
 | ||
|||||          S )a  Initialize from llm using default template.

        Args:
            vectorstore: Vector store for storing web pages
            llm: llm for search question generation
            search: GoogleSearchAPIWrapper
            prompt: prompt to generating search questions
            num_search_results: Number of pages per Google search
            text_splitter: Text splitter for splitting web pages into chunks
            trust_env: Whether to use the http_proxy/https_proxy env variables
                or check .netrc for proxy configuration
            allow_dangerous_requests: A flag to force users to acknowledge
                the risks of SSRF attacks when using this retriever

        Returns:
            WebResearchRetriever
        c                 ,    t          | t                    S N)
isinstancer   )rK   s    r&   <lambda>z/WebResearchRetriever.from_llm.<locals>.<lambda>   s    C!:!: r%   )default_promptconditionals)rK   rL   output_parser)r6   r7   r8   r:   r>   rA   rB   )r   DEFAULT_SEARCH_PROMPTDEFAULT_LLAMA_SEARCH_PROMPT
get_promptr   r+   )clsr6   rK   r8   rL   r:   r>   rA   rB   QUESTION_PROMPT_SELECTORr7   s              r&   from_llmzWebResearchRetriever.from_llmq   s    @  	>'@4::<WX( ( ($ .88==F 244
 
 
	 s#1'%=
 
 
 	
r%   queryc                     |d                                          rG|                    d          }|dk    r,||dz   d          }|                    d          r
|d d         }|                                S )Nr   "r9   )isdigitfindendswithstrip)r1   r[   first_quote_poss      r&   clean_search_queryz'WebResearchRetriever.clean_search_query   sz    
 8 	'#jjooO"$$o1334>>#&& '!#2#JE{{}}r%   c                 f    |                      |          }| j                            ||          }|S )z3Returns num_search_results pages per Google search.)rd   r8   results)r1   r[   r:   query_cleanresults        r&   search_toolz WebResearchRetriever.search_tool   s2    --e44$$[2DEEr%   run_managerc                d   t                               d           |                     d|i          }t                               d|            |d         }t                               d|            t                               d           g }|D ]}|                     || j                  }t                               d           t                               d|            |D ]3}|                    dd	          r|                    |d                    4t          |          }t          |	                    | j
                            }	t                               d
|	            |	rt          |	d| j                  }
t                      }t                               d           |
                                }t          |                    |                    }| j                            |          }| j                            |           | j
                            |	           t                               d           g }|D ]/}|                    | j                            |                     0d |D             }t          |                                          }|S )zSearch Google for documents related to the query input.

        Args:
            query: user query

        Returns:
            Relevant documents from all various urls.
        z*Generating questions for Google Search ...r'   z#Questions for Google Search (raw): r,   zQuestions for Google Search: zSearching for relevant urls...zSearch results: linkNzNew URLs to load: T)ignore_load_errorsrA   zIndexing new urls...z*Grabbing most relevant splits from urls...c           	          i | ]=}|j         t          t          |j                                                            f|>S r$   )page_contenttuplesortedmetadataitems).0docs     r&   
<dictcomp>z@WebResearchRetriever._get_relevant_documents.<locals>.<dictcomp>   sM     !
 !
 !
MPSuVCL,>,>,@,@%A%ABBCS!
 !
 !
r%   )loggerinfor7   ri   r:   rE   appendsetlist
differencer@   r   rA   r   loadtransform_documentsr>   split_documentsr6   add_documentsextendsimilarity_searchvalues)r1   r[   rj   rh   	questionsurls_to_looksearch_resultsresurlsnew_urlsloader	html2textdocsunique_documents_dictunique_documentss                  r&   _get_relevant_documentsz,WebResearchRetriever._get_relevant_documents   s     	@AAAU 344B&BBCCC6N	?I??@@@ 	4555 	5 	5E!--eT5LMMNKK8999KK;>;;<<<% 5 57764(( 5 ''F4445
 <   (9::;;333444 
	/$TT^  F -..IKK.///;;==D	55d;;<<D%55d;;D**4000$$X... 	@AAA 	C 	CEKK(::5AABBBB!
 !
TX!
 !
 !
   5 < < > >??r%   c                   K   t           rO   )NotImplementedError)r1   r[   rj   s      r&   _aget_relevant_documentsz-WebResearchRetriever._aget_relevant_documents  s       "!r%   )r9   )'r   r   r    r!   r   r6   r   r#   r   r8   r   r:   intr   r>   r   r{   r@   r   r"   rA   boolrB   r   rH   classmethodr   r   r   rZ   rd   dictri   r	   r
   r   r   r   __classcell__)rI   s   @r&   r5   r5   =   s        ((  %u=     K    %*U3<W%X%X%XF"XXX#eA3VWWWWWW"'%&&$bIIIG# # #M<    $e*B  L$s)    e/  It    &+d***# # # # # # # #$  04"#8V8V39
 9
 9
  ).7
 7
 7
 7
 '	7

 +,7
  7
 67
 7
 #'7
 
 7
 7
 7
 [7
r        # d4j    B B  4	B 
 
hB  B  B  B H"" 9	"
 
h" " " " " " " "r%   r5   )1loggingr/   typingr   r   r   langchain.chainsr    langchain.chains.prompt_selectorr   langchain_core.callbacksr   r	   langchain_core.documentsr
   langchain_core.language_modelsr   langchain_core.output_parsersr   langchain_core.promptsr   r   langchain_core.retrieversr   langchain_core.vectorstoresr   langchain_text_splittersr   r   pydanticr   r   $langchain_community.document_loadersr   )langchain_community.document_transformersr   langchain_community.llmsr   langchain_community.utilitiesr   	getLoggerr   rw   r   rV   rU   r"   r+   r5   r$   r%   r&   <module>r      sb    				 & & & & & & & & & & % % % % % % F F F F F F        . - - - - - 2 2 2 2 2 2 : : : : : : E E E E E E E E 3 3 3 3 3 3 3 3 3 3 3 3 Q Q Q Q Q Q Q Q % % % % % % % % @ @ @ @ @ @ J J J J J J - - - - - - @ @ @ @ @ @		8	$	$    I    -nLL    'L6       /S	:   N" N" N" N" N"= N" N" N" N" N"r%   