
    gt                        d Z ddlZddlZddlZddlmZmZmZmZ ddl	Z
ddlmZ ddlmZ ddlmZmZmZmZmZmZ dd	lmZ dd
lmZ  e            r
ddlmZmZmZ  e            rddlZ ej        e           Z!dZ" G d d          Z# G d de#          Z$ G d de#          Z% G d de%          Z& G d de%          Z' G d d          Z(dS )z#RAG Retriever model implementation.    N)IterableListOptionalTuple   )PreTrainedTokenizer)BatchEncoding)cached_fileis_datasets_availableis_faiss_availableloggingrequires_backends	strtobool   )	RagConfig)RagTokenizer)Datasetload_datasetload_from_diskzAhttps://storage.googleapis.com/huggingface-nlp/datasets/wiki_dpr/c                       e Zd ZdZdej        dee         fdZddej        de	ej        ej        f         fdZ
d Zd	 Zd
S )IndexzL
    A base class for the Indices encapsulated by the [`RagRetriever`].
    doc_idsreturnc                     t           )z
        Returns a list of dictionaries, containing titles and text of the retrieved documents.

        Args:
            doc_ids (`np.ndarray` of shape `(batch_size, n_docs)`):
                A tensor of document indices.
        NotImplementedErrorselfr   s     a/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/rag/retrieval_rag.pyget_doc_dictszIndex.get_doc_dicts1   s
     "!       question_hidden_statesc                     t           )a$  
        For each query in the batch, retrieves `n_docs` documents.

        Args:
            question_hidden_states (`np.ndarray` of shape `(batch_size, vector_size)`):
                An array of query vectors.
            n_docs (`int`):
                The number of docs retrieved per query.

        Returns:
            `np.ndarray` of shape `(batch_size, n_docs)`: A tensor of indices of retrieved documents. `np.ndarray` of
            shape `(batch_size, vector_size)`: A tensor of vector representations of retrieved documents.
        r   )r   r#   n_docss      r   get_top_docszIndex.get_top_docs;   s
     "!r!   c                     t           )zA
        Returns `True` if index is already initialized.
        r   r   s    r   is_initializedzIndex.is_initializedK   s
     "!r!   c                     t           )a
  
        A function responsible for loading the index into memory. Should be called only once per training run of a RAG
        model. E.g. if the model is trained on multiple GPUs in a distributed setup, only one of the workers will load
        the index.
        r   r(   s    r   
init_indexzIndex.init_indexQ   s
     "!r!   Nr"   )__name__
__module____qualname____doc__npndarrayr   dictr    r   r&   r)   r+    r!   r   r   r   ,   s         "RZ "DJ " " " "" "2: "ERTR\^`^hRhLi " " " " " " "" " " " "r!   r   c                       e Zd ZdZdZdZd Zd Zd Zd Z	d Z
d	 Zd
ej        fdZddej        deej        ej        f         fdZdS )LegacyIndexa  
    An index which can be deserialized from the files built using https://github.com/facebookresearch/DPR. We use
    default faiss index parameters as specified in that repository.

    Args:
        vector_size (`int`):
            The dimension of indexed vectors.
        index_path (`str`):
            A path to a *directory* containing index files compatible with [`~models.rag.retrieval_rag.LegacyIndex`]
    z,hf_bert_base.hnswSQ8_correct_phi_128.c_indexzpsgs_w100.tsv.pklc                 ~    g | _         || _        |                                 | _        || _        d | _        d| _        d S )NF)index_id_to_db_id
index_path_load_passagespassagesvector_sizeindex_index_initialized)r   r<   r9   s      r   __init__zLegacyIndex.__init__i   sA    !#$++--&
"'r!   c                 F   t           j                            |          }	 t          ||          }n/# t          $ r" d| d| d| d| d| d}t	          |          w xY w|rt
                              d|            n t
                              d| d|            |S )	NzCan't load 'z'. Make sure that:

- 'zB' is a correct remote path to a directory containing a file named z

- or 'z=' is the correct path to a directory containing a file named z.

zloading file z from cache at )ospathisdirr
   EnvironmentErrorloggerinfo)r   r9   filenameis_localresolved_archive_filemsgs         r   _resolve_pathzLegacyIndex._resolve_pathq   s   7==,,		($/
H$E$E!! 	( 	( 	(rx r r r rdlr r#r rbjr r r 
 #3'''	(  	ZKK?(=??@@@@KKXXXAVXXYYY$$s	   2 ,Ac                 z   t                               d| j                    |                     | j        | j                  }t          t          j                            dd                    st          d          t          |d          5 }t          j        |          }d d d            n# 1 swxY w Y   |S )NLoading passages from TRUST_REMOTE_CODEFalsez  This part uses `pickle.load` which is insecure and will execute arbitrary code that is potentially malicious. It's recommended to never unpickle data that could have come from an untrusted source, or that could have been tampered with. If you already verified the pickle data and decided to use it, you can set the environment variable `TRUST_REMOTE_CODE` to `True` to allow it.rb)rE   rF   r9   rK   PASSAGE_FILENAMEr   rA   environget
ValueErroropenpickleload)r   passages_pathpassages_filer;   s       r   r:   zLegacyIndex._load_passages   s    >T_>>???**4?D<QRR(;WEEFF 	b   -&& 	2-{=11H	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2 	2s   B00B47B4c                 Z   t                               d| j                    |                     | j        | j        dz             }t          j        |          | _        |                     | j        | j        dz             }t          t          j
                            dd                    st          d          t          |d          5 }t          j        |          | _        d d d            n# 1 swxY w Y   t#          | j                  | j        j        k    s
J d            d S )	NLoading index from z
.index.dprz.index_meta.dprrN   rO   rP   rQ   z<Deserialized index_id_to_db_id should match faiss index size)rE   rF   r9   rK   INDEX_FILENAMEfaiss
read_indexr=   r   rA   rS   rT   rU   rV   rW   rX   r8   lenntotal)r   resolved_index_pathresolved_meta_pathmetadata_files       r   _deserialize_indexzLegacyIndex._deserialize_index   sh   ;$/;;<<<"00$BUXdBdee%&9::
!//ATWhAhii(;WEEFF 	b   $d++ 	@}%+[%?%?D"	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ 	@ &''4:+<<<<I =<<<<s   C44C8;C8c                     | j         S Nr>   r(   s    r   r)   zLegacyIndex.is_initialized       &&r!   c                     t          j        | j        dz   d          }d|j        _        d|j        _        || _        |                                  d| _        d S )Nr   i         T)	r^   IndexHNSWFlatr<   hnswefSearchefConstructionr=   re   r>   )r   r=   s     r   r+   zLegacyIndex.init_index   sW    #D$4q$8#>>!
$'
!
!!!"&r!   r   c                      g }|D ]1}d |D             } fd|D             }|                     |           2g }|D ]7}i }d |D             |d<   d |D             |d<   |                     |           8|S )Nc                 F    g | ]}t          t          |                    S r4   )strint).0doc_ids     r   
<listcomp>z-LegacyIndex.get_doc_dicts.<locals>.<listcomp>   s&    <<<3s6{{##<<<r!   c                 *    g | ]}j         |         S r4   )r;   ru   rv   r   s     r   rw   z-LegacyIndex.get_doc_dicts.<locals>.<listcomp>   s     <<<fDM&)<<<r!   c                     g | ]
}|d          S )r   r4   ru   docs     r   rw   z-LegacyIndex.get_doc_dicts.<locals>.<listcomp>   s     8 8 8CQ 8 8 8r!   titlec                     g | ]
}|d          S r   r4   r{   s     r   rw   z-LegacyIndex.get_doc_dicts.<locals>.<listcomp>   s    7773A777r!   text)append)r   r   doc_list	doc_ids_iidsdocs	doc_dictsdoc_dicts   `       r   r    zLegacyIndex.get_doc_dicts   s      	" 	"I<<)<<<C<<<<<<<DOOD!!!!	 	' 	'DH 8 84 8 8 8HW77$777HVX&&&&r!   r"   r#   r   c                 b    t          j        t          |          d                              dd          }t          j        ||f          } j                            ||          \  }} fd|D             } fd|D             }t          j        |          t          j        |          fS )Nfloat32)dtyper   c                 ,    g | ]}fd |D             S )c                 n    g | ]1}j                             t          |                    d d         2S )Nr   )r=   reconstructrt   ry   s     r   rw   z7LegacyIndex.get_top_docs.<locals>.<listcomp>.<listcomp>   s8    SSSDJ**3v;;77<SSSr!   r4   ru   r   r   s     r   rw   z,LegacyIndex.get_top_docs.<locals>.<listcomp>   s/    lllX_SSSS7SSSlllr!   c                 ,    g | ]}fd |D             S )c                 D    g | ]}t          j        |                   S r4   )rt   r8   ry   s     r   rw   z7LegacyIndex.get_top_docs.<locals>.<listcomp>.<listcomp>   s)    JJJD*6233JJJr!   r4   r   s     r   rw   z,LegacyIndex.get_top_docs.<locals>.<listcomp>   s.    cccwJJJJ'JJJcccr!   )r1   zerosr`   reshapehstackr=   searcharray)	r   r#   r%   aux_dimquery_nhsw_vectors_docs_idsvectorsr   s	   `        r   r&   zLegacyIndex.get_top_docs   s    (3566iHHHPPQSUVWWY(>'HIIj''(:FCC8llllcklllccccZbcccx}}bhw////r!   Nr,   )r-   r.   r/   r0   r]   rR   r?   rK   r:   re   r)   r+   r1   r   r    r2   r   r&   r4   r!   r   r6   r6   Z   s        	 	 DN*( ( (% % %$  J J J$' ' '' ' 'RX    0 02: 0ERTR\^`^hRhLi 0 0 0 0 0 0r!   r6   c                       e Zd ZddZdefdZd Zd Zdej	        de
e         fd	Zddej	        deej	        ej	        f         fdZdS )HFIndexBaseFc                     || _         || _        || _        |                     |           |                    ddgdd           d S )N)
with_indexnumpy
embeddingsTr   )columnsoutput_all_columnsr   )r<   datasetr>   _check_dataset_format
set_format)r   r<   r   index_initializeds       r   r?   zHFIndexBase.__init__   sW    &"3"".?"@@@7\Nt[deeeeer!   r   c                 t   t          | j        t                    s$t          dt	          | j                             t          h dt          | j        j                  z
            dk    rt          d| j        j                   |r*d| j        	                                vrt          d          d S d S )Nz5Dataset should be a datasets.Dataset object, but got >   r   r}   r   r   zDataset should be a dataset with the following columns: title (str), text (str) and embeddings (arrays of dimension vector_size), but got columns r   zMissing faiss index in the dataset. Make sure you called `dataset.add_faiss_index` to compute it or `dataset.load_faiss_index` to load one from the disk.)

isinstancer   r   	TypeErrortyper`   setcolumn_namesrU   list_indexes)r   r   s     r   r   z!HFIndexBase._check_dataset_format   s    $,00 	jhTXY]YeTfTfhhiii...T\5N1O1OOPPSTTT?#'<#<? ?  
  	,dl.G.G.I.IIIK  	 	IIr!   c                     t                      rg   r   r(   s    r   r+   zHFIndexBase.init_index   s    !###r!   c                     | j         S rg   rh   r(   s    r   r)   zHFIndexBase.is_initialized   ri   r!   r   r   c                 T      fdt          j        d                   D             S )Nc                 Z    g | ]'}j         |                                                  (S r4   )r   tolist)ru   ir   r   s     r   rw   z-HFIndexBase.get_doc_dicts.<locals>.<listcomp>   s0    SSSaWQZ..001SSSr!   r   )rangeshaper   s   ``r   r    zHFIndexBase.get_doc_dicts   s0    SSSSS5qAQ;R;RSSSSr!   r"   r#   c                      j                             d||          \  }} fd|D             }d |D             }t          t          |                    D ]i}t          ||                   |k     rNt	          j        ||         t	          j        |t          ||                   z
   j        f          g          ||<   jt	          j        |          t	          j        |          fS )Nr   c                 >    g | ]}j         d  |D                      S )c                     g | ]
}|d k    |S r   r4   )ru   r   s     r   rw   z7HFIndexBase.get_top_docs.<locals>.<listcomp>.<listcomp>   s    ;;;AAFFaFFFr!   )r   )ru   indicesr   s     r   rw   z,HFIndexBase.get_top_docs.<locals>.<listcomp>   s/    PPP;;;;;<PPPr!   c                     g | ]
}|d          S )r   r4   r{   s     r   rw   z,HFIndexBase.get_top_docs.<locals>.<listcomp>   s    5553|$555r!   )	r   search_batchr   r`   r1   vstackr   r<   r   )r   r#   r%   r   r   r   r   r   s   `       r   r&   zHFIndexBase.get_top_docs   s    **<9OQWXX3PPPPCPPP55555s7||$$ 	m 	mA71:''Y
BHfs7ST:>VX\Xh=i4j4j'kll
x}}bhw////r!   N)Fr,   )r-   r.   r/   r?   boolr   r+   r)   r1   r2   r   r3   r    r   r&   r4   r!   r   r   r      s        f f f f    $ $ $' ' 'TRZ TDJ T T T T0 02: 0ERTR\^`^hRhLi 0 0 0 0 0 0r!   r   c                   d     e Zd ZdZ	 	 	 	 	 	 ddededed	ee         d
ee         f
 fdZd Z xZ	S )CanonicalHFIndexa  
    A wrapper around an instance of [`~datasets.Datasets`]. If `index_path` is set to `None`, we load the pre-computed
    index available with the [`~datasets.arrow_dataset.Dataset`], otherwise, we load the index from the indicated path
    on disk.

    Args:
        vector_size (`int`): the dimension of the passages embeddings used by the index
        dataset_name (`str`, optional, defaults to `wiki_dpr`):
            A dataset identifier of the indexed dataset on HuggingFace AWS bucket (list all available datasets and ids
            with `datasets.list_datasets()`).
        dataset_split (`str`, optional, defaults to `train`)
            Which split of the `dataset` to load.
        index_name (`str`, optional, defaults to `train`)
            The index_name of the index associated with the `dataset`. The index loaded from `index_path` will be saved
            under this name.
        index_path (`str`, optional, defaults to `None`)
            The path to the serialized faiss index on disk.
        use_dummy_dataset (`bool`, optional, defaults to `False`):
            If True, use the dummy configuration of the dataset for tests.
    wiki_dprtrainNFr<   dataset_namedataset_split
index_namer9   c                    t          |d u           t          |d u           z   dk    rt          d          || _        || _        || _        || _        || _        || _        t          	                    d| j                    t          | j        d| j        | j        |          }t                                          ||d           d S )Nr   z,Please provide `index_name` or `index_path`.rM   F)r   splitdummyrevisionr   )rt   rU   r   r   r   r9   use_dummy_datasetdataset_revisionrE   rF   r   superr?   )
r   r<   r   r   r   r9   r   r   r   	__class__s
            r   r?   zCanonicalHFIndex.__init__  s     zT!""St);%<%<<AAKLLL(*$$!2 0@T->@@AAA$(%
 
 
 	gGGGGGr!   c           	         | j         Dt                              d| j                     | j                            d| j                    n|t                              d| j         d| j                    t          | j        dd| j        | j        | j	        | j
                  | _        | j                            ddgd           d| _        d S )	Nr\   r   filez with index name T)with_embeddingsr   r   r   r   r   r   )r   r   )r9   rE   rF   r   load_faiss_indexr   r   r   r   r   r   r   r>   r(   s    r   r+   zCanonicalHFIndex.init_index!  s    ?&KK?do??@@@L)),T_)MMMMKKcd.?ccRVRaccddd'! $(?,.  DL L##Gl^X\#]]]"&r!   )r   r   NNFN)
r-   r.   r/   r0   rt   rs   r   r?   r+   __classcell__r   s   @r   r   r      s         0 '$$($(H HH H 	H
 SMH SMH H H H H H8' ' ' ' ' ' 'r!   r   c                   F     e Zd ZdZddef fdZed             Zd Z xZ	S )CustomHFIndexa  
    A wrapper around an instance of [`~datasets.Datasets`]. The dataset and the index are both loaded from the
    indicated paths on disk.

    Args:
        vector_size (`int`): the dimension of the passages embeddings used by the index
        dataset_path (`str`):
            The path to the serialized dataset on disk. The dataset should have 3 columns: title (str), text (str) and
            embeddings (arrays of dimension vector_size)
        index_path (`str`)
            The path to the serialized faiss index on disk.
    Nr<   c                 b    t                                          |||d u            || _        d S )Nr   )r   r?   r9   )r   r<   r   r9   r   s       r   r?   zCustomHFIndex.__init__B  s2    gtASTTT$r!   c                     t                               d|            ||t          d          t          |          } | |||          S )NrM   zPlease provide `dataset_path` and `index_path` after calling `dataset.save_to_disk(dataset_path)` and `dataset.get_index('embeddings').save(index_path)`.)r<   r   r9   )rE   rF   rU   r   )clsr<   dataset_pathr9   r   s        r   r   zCustomHFIndex.load_from_diskF  sf    ;\;;<<<:#5J   !..s{G
SSSSr!   c                     |                                  sLt                              d| j                    | j                            d| j                   d| _        d S d S )Nr\   r   r   T)r)   rE   rF   r9   r   r   r>   r(   s    r   r+   zCustomHFIndex.init_indexQ  sh    ""$$ 	+KK?do??@@@L)),T_)MMM&*D###	+ 	+r!   rg   )
r-   r.   r/   r0   rt   r?   classmethodr   r+   r   r   s   @r   r   r   4  s         % %C % % % % % % T T [T+ + + + + + +r!   r   c            	       V    e Zd ZdZd fd	Zed             Zedd            Zd Z	d Z
dd	Zd
ededee         fdZdej        dedeej        ej        f         fdZdej        dedeej        ee         f         fdZdefdZ	 	 	 ddeee                  dej        defdZ xZS )RagRetrievera  
    Retriever used to get documents from vector queries. It retrieves the documents embeddings as well as the documents
    contents, and it formats them to be used with a RagModel.

    Args:
        config ([`RagConfig`]):
            The configuration of the RAG model this Retriever is used with. Contains parameters indicating which
            `Index` to build. You can load your own custom dataset with `config.index_name="custom"` or use a canonical
            one (default) from the datasets library with `config.index_name="wiki_dpr"` for example.
        question_encoder_tokenizer ([`PreTrainedTokenizer`]):
            The tokenizer that was used to tokenize the question. It is used to decode the question and then use the
            generator_tokenizer.
        generator_tokenizer ([`PreTrainedTokenizer`]):
            The tokenizer used for the generator part of the RagModel.
        index ([`~models.rag.retrieval_rag.Index`], optional, defaults to the one defined by the configuration):
            If specified, use this index instead of the one built using the configuration

    Examples:

    ```python
    >>> # To load the default "wiki_dpr" dataset with 21M passages from wikipedia (index name is 'compressed' or 'exact')
    >>> from transformers import RagRetriever

    >>> retriever = RagRetriever.from_pretrained(
    ...     "facebook/dpr-ctx_encoder-single-nq-base", dataset="wiki_dpr", index_name="compressed"
    ... )

    >>> # To load your own indexed dataset built with the datasets library. More info on how to build the indexed dataset in examples/rag/use_own_knowledge_dataset.py
    >>> from transformers import RagRetriever

    >>> dataset = (
    ...     ...
    ... )  # dataset must be a datasets.Datasets object with columns "title", "text" and "embeddings", and it must have a faiss index
    >>> retriever = RagRetriever.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base", indexed_dataset=dataset)

    >>> # To load your own indexed dataset built with the datasets library that was saved on disk. More info in examples/rag/use_own_knowledge_dataset.py
    >>> from transformers import RagRetriever

    >>> dataset_path = "path/to/my/dataset"  # dataset saved via *dataset.save_to_disk(...)*
    >>> index_path = "path/to/my/index.faiss"  # faiss index saved via *dataset.get_index("embeddings").save(...)*
    >>> retriever = RagRetriever.from_pretrained(
    ...     "facebook/dpr-ctx_encoder-single-nq-base",
    ...     index_name="custom",
    ...     passages_path=dataset_path,
    ...     index_path=index_path,
    ... )

    >>> # To load the legacy index built originally for Rag's paper
    >>> from transformers import RagRetriever

    >>> retriever = RagRetriever.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base", index_name="legacy")
    ```NTc                 ^   || _         t          | ddg           t                                                       |p|                     |          | _        || _        || _        |j        | _        |j	        | _
        || _        | j         r|                                  d | _        d| _        d S )Ndatasetsr^   F)_init_retrievalr   r   r?   _build_indexr=   generator_tokenizerquestion_encoder_tokenizerr%   retrieval_batch_size
batch_sizeconfiginit_retrievalctx_encoder_tokenizerreturn_tokenized_docs)r   r   r   r   r=   r   r   s         r   r?   zRagRetriever.__init__  s    -$W 56667d//77
#6 *D'm 5 	"!!!%)"%*"""r!   c           	      :   | j         dk    r!t          | j        | j        pt                    S | j         dk    r,t
                              | j        | j        | j                  S t          | j        | j	        | j
        | j         | j        | j        | j                  S )Nlegacycustom)r<   r   r9   )r<   r   r   r   r9   r   r   )r   r6   retrieval_vector_sizer9   LEGACY_INDEX_PATHr   r   rY   r   r   r   r   r   r   s    r   r   zRagRetriever._build_index  s    ((,!6%6   (** //"8#1!, 0    $"8#^$2!,!,"(":!'!8   r!   c                 D   t          | ddg           |                    dd           pt          j        |fi |}t	          j        ||          }|j        }|j        }|d|_        t          |j	        |          }n| 
                    |          } | ||||          S )Nr   r^   r   r   r   )r   r   r=   )r   popr   from_pretrainedr   question_encoder	generatorr   r   r   r   )	r   retriever_name_or_pathindexed_datasetkwargsr   rag_tokenizerr   r   r=   s	            r   r   zRagRetriever.from_pretrained  s    #
G4555Hd++jy/HI_/j/jci/j/j$45KTZ[[[%2%C"+5& (F!&">PPEE$$V,,Es'A 3	
 
 
 	
r!   c                    t          | j        t                    r| j        j        ^t
          j                            |d          }| j        j        	                    d          
                    |           || j        _        | j        j        t
          j                            |d          }| j        j        j                            d          }| j        j                            |           || j        j        j        d<   || j        _        | j                            |           t!          | j        | j                  }|                    |           d S )Nzhf_dataset_index.faissr   
hf_dataset)r   r   )r   r=   r   r   r9   rA   rB   joinr   	get_indexsaverY   _indexesr   save_to_disksave_pretrainedr   r   r   )r   save_directoryr9   rY   faiss_indexr   s         r   r  zRagRetriever.save_pretrained  s*   dj-00 	:{%-W\\.:RSS

",,\::??
KKK)3&{(0 "^\ J J"j09==lKK
"//>>><G
"+L9,9)##N333$!<.
 
 
 	%%n55555r!   c                 l    t                               d           | j                                         dS )zT
        Retriever initialization function. It loads the index into memory.
        zinitializing retrievalN)rE   rF   r=   r+   r(   s    r   r   zRagRetriever.init_retrieval  s1    
 	,---
r!   c                       fdfdt          t                              D             } j                            | j        j        |dd          }|d         |d         fS )a%  
        Postprocessing retrieved `docs` and combining them with `input_strings`.

        Args:
            docs  (`dict`):
                Retrieved documents.
            input_strings (`str`):
                Input strings decoded by `preprocess_query`.
            prefix (`str`):
                Prefix added at the beginning of each input, typically used with T5-based models.

        Return:
            `tuple(tensors)`: a tuple consisting of two elements: contextualized `input_ids` and a compatible
            `attention_mask`.
        c                     |                      d          r
| dd          } |                     d          r
| d d         } |d}|| z   j        j        z   |z   j        j        z   |z                       dd          }|S )N"r   r    z   )
startswithendswithr   	title_sepdoc_sepreplace)	doc_titledoc_textinput_stringprefixoutr   s        r   cat_input_and_docz8RagRetriever.postprocess_docs.<locals>.cat_input_and_doc  s     ##C(( *%abbM	!!#&& +%crcN	~I%(==H4;K^^ammvvc C Jr!   c           	          g | ]L}t                    D ]:} |         d          |         |         d         |         |                   ;MS )r}   r   )r   )ru   r   jr  r   input_stringsr%   r  s      r   rw   z1RagRetriever.postprocess_docs.<locals>.<listcomp>  s     	
 	
 	
 6]]	
 	
  Q #Q"a 	 	
 	
 	
 	
r!   
max_lengthT)r  return_tensorspadding
truncation	input_idsattention_mask)r   r`   r   batch_encode_plusr   max_combined_length)	r   r   r  r  r%   r  rag_input_stringscontextualized_inputsr  s	   `````   @r   postprocess_docszRagRetriever.postprocess_docs  s    "	 	 	 	 		
 	
 	
 	
 	
 	
 	
 	
 3t99%%	
 	
 	
 !% 8 J J{6)  !K !
 !
 %[13HIY3ZZZr!   t
chunk_sizer   c                 \    fdt          dt                              D             S )Nc                 *    g | ]}||z            S r4   r4   )ru   r   r%  r$  s     r   rw   z.RagRetriever._chunk_tensor.<locals>.<listcomp>  s'    LLL!!a*n$%LLLr!   r   )r   r`   )r   r$  r%  s    ``r   _chunk_tensorzRagRetriever._chunk_tensor  s3    LLLLLuQA
/K/KLLLLr!   r#   r%   c                    |                      || j                  }g }g }|D ]}t          j                    }| j                            ||          \  }}t
                              dt          j                    |z
   d|j                    |                    |           |                    |           t          j
        |          t          j
        |          fS )Nzindex search time: z sec, batch size )r(  r   timer=   r&   rE   debugr   extendr1   r   )	r   r#   r%   question_hidden_states_batchedids_batchedvectors_batched
start_timer   r   s	            r   _main_retrievezRagRetriever._main_retrieve  s    )-););<RTXTc)d)d&&D 	, 	,"J:223I6RRLCLLodikkJ&>ooQgQmoo   s###""7++++H[!!H_%%
 	
r!   c                 n    |                      ||          \  }}||| j                            |          fS )a%  
        Retrieves documents for specified `question_hidden_states`.

        Args:
            question_hidden_states (`np.ndarray` of shape `(batch_size, vector_size)`):
                A batch of query vectors to retrieve with.
            n_docs (`int`):
                The number of docs retrieved per query.

        Return:
            `Tuple[np.ndarray, np.ndarray, List[dict]]`: A tuple with the following objects:

            - **retrieved_doc_embeds** (`np.ndarray` of shape `(batch_size, n_docs, dim)`) -- The retrieval embeddings
              of the retrieved docs per query.
            - **doc_ids** (`np.ndarray` of shape `(batch_size, n_docs)`) -- The ids of the documents in the index
            - **doc_dicts** (`List[dict]`): The `retrieved_doc_embeds` examples per query.
        )r1  r=   r    )r   r#   r%   r   retrieved_doc_embedss        r   retrievezRagRetriever.retrieve/  s>    & )-(;(;<RTZ([([%%#Wdj.F.Fw.O.OOOr!   r   c                 "    || _         d| _        d S NT)r   r   )r   r   s     r   set_ctx_encoder_tokenizerz&RagRetriever.set_ctx_encoder_tokenizerE  s    %:"%)"""r!   question_input_idsc           	         ||n| j         }||n| j        j        j        }|                     ||          \  }}}| j                            |d          }	|                     ||	|||          \  }
}| j        rg }g }t          t          |                    D ]b}t          |          D ]P}|                    ||         d         |                    |                    ||         d         |                    Qc|                     ||dd|          }t          |
||||d	         |d
         d|          S t          |
|||d|          S )a#  
        Retrieves documents for specified `question_hidden_states`.

        Args:
            question_input_ids (`List[List[int]]`) batch of input ids
            question_hidden_states (`np.ndarray` of shape `(batch_size, vector_size)`:
                A batch of query vectors to retrieve with.
            prefix (`str`, *optional*):
                The prefix used by the generator's tokenizer.
            n_docs (`int`, *optional*):
                The number of docs retrieved per query.
            return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to "pt"):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.

        Returns: [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

            - **context_input_ids** -- List of token ids to be fed to a model.

              [What are input IDs?](../glossary#input-ids)

            - **context_attention_mask** -- List of indices specifying which tokens should be attended to by the model
            (when `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`).

              [What are attention masks?](../glossary#attention-mask)

            - **retrieved_doc_embeds** -- List of embeddings of the retrieved documents
            - **doc_ids** -- List of ids of the retrieved documents
        NT)skip_special_tokens)r  r   r}   longest)r  r  r  r  r  )context_input_idscontext_attention_maskr3  r   tokenized_doc_idstokenized_doc_attention_mask)tensor_type)r<  r=  r3  r   )r%   r   r   r  r4  r   batch_decoder#  r   r   r`   r   r   r	   )r   r8  r#   r  r%   r  r3  r   r   r  r<  r=  retrieved_doc_textretrieved_doc_titleb_idxdoc_idxtokenized_docss                    r   __call__zRagRetriever.__call__J  s   R "-4;!-4;3H3O.2mm<RTZ.[.[+gt7DDEWmqDrr484I4I- 5J 5
 5
11 % &	!#"$s4yy)) N N$V}} N NG&--d5k&.A'.JKKK'..tE{7/CG/LMMMMN "77#"!- 8  N !):.D,@&)7)D4BCS4T  +
 
 
 
 !):.D,@&	  +   r!   r6  rg   )NNN)r-   r.   r/   r0   r?   staticmethodr   r   r   r  r   r#  r   rt   r   r(  r1   r2   r   r1  r3   r4  r   r7  r	   rG  r   r   s   @r   r   r   X  s       3 3j+ + + + + +$   \. 
 
 
 [
$6 6 6(     2[ 2[ 2[ 2[hMx MS MT(^ M M M M
RZ 
 
QVWYWacecmWmQn 
 
 
 
"Prz P3 P5QSQ[]abf]gQgKh P P P P,*?R * * * * X X cOX !#
X 
X X X X X X X Xr!   r   ))r0   rA   rW   r*  typingr   r   r   r   r   r1   tokenization_utilsr   tokenization_utils_baser	   utilsr
   r   r   r   r   r   configuration_ragr   tokenization_ragr   r   r   r   r   r^   
get_loggerr-   rE   r   r   r6   r   r   r   r   r4   r!   r   <module>rP     sY   * ) 				   2 2 2 2 2 2 2 2 2 2 2 2     5 5 5 5 5 5 4 4 4 4 4 4 r r r r r r r r r r r r r r r r ( ( ( ( ( ( * * * * * *  ?>>>>>>>>>> LLL 
	H	%	% X +" +" +" +" +" +" +" +"\h0 h0 h0 h0 h0% h0 h0 h0V'0 '0 '0 '0 '0% '0 '0 '0TB' B' B' B' B'{ B' B' B'J!+ !+ !+ !+ !+K !+ !+ !+HJ J J J J J J J J Jr!   