from __future__ import annotations

from collections.abc import Iterable
from typing import Any

import torch
from torch import Tensor, nn

from sentence_transformers.models import StaticEmbedding, Transformer
from sentence_transformers.SentenceTransformer import SentenceTransformer


class GISTEmbedLoss(nn.Module):
    def __init__(
        self,
        model: SentenceTransformer,
        guide: SentenceTransformer,
        temperature: float = 0.01,
    ) -> None:
        """
        This loss is used to train a SentenceTransformer model using the GISTEmbed algorithm.
        It takes a model and a guide model as input, and uses the guide model to guide the
        in-batch negative sample selection. The cosine similarity is used to compute the loss
        and the temperature parameter is used to scale the cosine similarities.

        Args:
            model: SentenceTransformer model based on a `transformers`
                model.
            guide: SentenceTransformer model to guide the in-batch
                negative sample selection.
            temperature: Temperature parameter to scale the cosine
                similarities.

        References:
            - For further details, see: https://arxiv.org/abs/2402.16829

        Requirements:
            1. (anchor, positive, negative) triplets
            2. (anchor, positive) pairs

        Inputs:
            +---------------------------------------+--------+
            | Texts                                 | Labels |
            +=======================================+========+
            | (anchor, positive, negative) triplets | none   |
            +---------------------------------------+--------+
            | (anchor, positive) pairs              | none   |
            +---------------------------------------+--------+

        Recommendations:
            - Use ``BatchSamplers.NO_DUPLICATES`` (:class:`docs <sentence_transformers.training_args.BatchSamplers>`) to
              ensure that no in-batch negatives are duplicates of the anchor or positive samples;
              a brief configuration sketch follows.
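
            For instance, the batch sampler can be selected through
            :class:`~sentence_transformers.training_args.SentenceTransformerTrainingArguments`
            (a brief sketch; ``output_dir`` is a placeholder)::

                from sentence_transformers import SentenceTransformerTrainingArguments
                from sentence_transformers.training_args import BatchSamplers

                args = SentenceTransformerTrainingArguments(
                    output_dir="output",
                    batch_sampler=BatchSamplers.NO_DUPLICATES,
                )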

        Relations:
            - :class:`MultipleNegativesRankingLoss` is similar to this loss, but it does not use
              a guide model to guide the in-batch negative sample selection. `GISTEmbedLoss` yields
              a stronger training signal at the cost of some training overhead.

        Example:
            ::

                from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses
                from datasets import Dataset

                model = SentenceTransformer("microsoft/mpnet-base")
                guide = SentenceTransformer("all-MiniLM-L6-v2")
                train_dataset = Dataset.from_dict({
                    "anchor": ["It's nice weather outside today.", "He drove to work."],
                    "positive": ["It's so sunny.", "He took the car to the office."],
                })
                loss = losses.GISTEmbedLoss(model, guide)

                trainer = SentenceTransformerTrainer(
                    model=model,
                    train_dataset=train_dataset,
                    loss=loss,
                )
                trainer.train()
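
            For (anchor, positive, negative) triplets, the training dataset simply gains a
            third column; a minimal sketch (the texts are illustrative placeholders)::

                train_dataset = Dataset.from_dict({
                    "anchor": ["It's nice weather outside today."],
                    "positive": ["It's so sunny."],
                    "negative": ["It's quite rainy, sadly."],
                })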
        """
        super().__init__()
        self.model = model
        self.guide = guide
        self.temperature = temperature
        self.similarity_fct = nn.CosineSimilarity(dim=-1)
        if not isinstance(model[0], Transformer) or not isinstance(guide[0], Transformer):
            raise ValueError(
                "Both the training model and the guiding model must be based on the `transformers` architecture."
            )
        # Retokenization is required if the two models do not share a vocabulary, or if the
        # guide cannot accept sequences as long as the training model produces.
        self.must_retokenize = (
            model.tokenizer.vocab != guide.tokenizer.vocab or guide.max_seq_length < model.max_seq_length
        )
        if self.must_retokenize:
            self.tokenizer = self.model.tokenizer
            if isinstance(self.model[0], StaticEmbedding):
                raise ValueError(
                    "If we must retokenize because the guide model has a different tokenizer, "
                    "then the Sentence Transformer model must not be based on a StaticEmbedding."
                )

    def sim_matrix(self, embed1: Tensor, embed2: Tensor) -> Tensor:
        # Pairwise cosine similarities: (batch, 1, dim) vs. (1, batch, dim) -> (batch, batch)
        return self.similarity_fct(embed1.unsqueeze(1), embed2.unsqueeze(0))

    def forward(self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor) -> Tensor:
        embeddings = [self.model(sentence_feature)["sentence_embedding"] for sentence_feature in sentence_features]
        with torch.no_grad():
            if self.must_retokenize:
                # Decode with the training tokenizer and retokenize for the guide model.
                decoded = [
                    self.tokenizer.batch_decode(sentence_feature["input_ids"], skip_special_tokens=True)
                    for sentence_feature in sentence_features
                ]
                sentence_features = [self.guide.tokenize(sentences) for sentences in decoded]
                sentence_features = [
                    {key: value.to(self.guide.device) for key, value in sentence_feature.items()}
                    for sentence_feature in sentence_features
                ]

            guide_embeddings = [
                self.guide(sentence_feature)["sentence_embedding"] for sentence_feature in sentence_features
            ]

        negative = None
        negative_guide = None

        if len(embeddings) == 2:
            anchor, positive = embeddings
            anchor_guide, positive_guide = guide_embeddings
        elif len(embeddings) == 3:
            anchor, positive, negative = embeddings
            anchor_guide, positive_guide, negative_guide = guide_embeddings
        else:
            raise ValueError(f"Expected 2 or 3 embeddings, got {len(embeddings)}")

        # Compute the model's similarity matrices for the anchor/positive combinations
        ap_sim = self.sim_matrix(anchor, positive)
        aa_sim = self.sim_matrix(anchor, anchor)
        pp_sim = self.sim_matrix(positive, positive)

        # The same similarity matrices, as scored by the guide model
        guided_ap_sim = self.sim_matrix(anchor_guide, positive_guide)
        guided_aa_sim = self.sim_matrix(anchor_guide, anchor_guide)
        guided_pp_sim = self.sim_matrix(positive_guide, positive_guide)

        # Define the per-anchor threshold: the guide's similarity between each anchor and its positive
        guided_sim = guided_ap_sim.diagonal().view(-1, 1)

        # Find which samples cannot be used as negatives because they are
        # more similar to the query than the assigned positive as deemed by the guide model.
        # For these samples, we mask them with -inf to basically ignore their contribution to
        # the loss.
        ap_sim[guided_ap_sim > guided_sim] = -torch.inf
        aa_sim[guided_aa_sim > guided_sim] = -torch.inf
        pp_sim[guided_pp_sim > guided_sim] = -torch.inf

        scores = [ap_sim, aa_sim, pp_sim]

        # Handle the case where we have an explicit negative sample
        if negative is not None:
            an_sim = self.sim_matrix(anchor, negative)
            guided_an_sim = self.sim_matrix(anchor_guide, negative_guide)
            an_sim[guided_an_sim > guided_sim] = -torch.inf
            scores.append(an_sim)

        scores = torch.cat(scores, dim=1) / self.temperature

        # NOTE: We use arange here since the ap_sim matrix contains the anchor-positive
        # similarities along the diagonal, i.e. the positive for anchor i sits at column i.
        labels = torch.arange(scores.size(0)).long().to(scores.device)

        return nn.CrossEntropyLoss()(scores, labels)

    def get_config_dict(self) -> dict[str, Any]:
        return {
            "guide": self.guide,
            "temperature": self.temperature,
        }

    @property
    def citation(self) -> str:
        return """
@misc{solatorio2024gistembed,
    title={GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
    author={Aivin V. Solatorio},
    year={2024},
    eprint={2402.16829},
    archivePrefix={arXiv},
    primaryClass={cs.LG}
}
"""
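

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the core GISTEmbed
# masking rule demonstrated on made-up similarity matrices. Any in-batch
# candidate that the guide model scores higher than the assigned positive is
# treated as a likely false negative and masked with -inf before the
# cross-entropy loss is computed.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    guided_ap_sim = torch.tensor([[0.90, 0.95], [0.30, 0.80]])  # guide model's view
    ap_sim = torch.tensor([[10.0, 12.0], [2.0, 9.0]])  # training model's scores

    # Per-anchor threshold: the guide's anchor-positive similarity (the diagonal).
    guided_sim = guided_ap_sim.diagonal().view(-1, 1)

    # Candidate (0, 1) is guide-scored above anchor 0's own positive (0.95 > 0.90),
    # so it is masked out; every other entry survives.
    ap_sim[guided_ap_sim > guided_sim] = -torch.inf
    print(ap_sim)  # tensor([[10., -inf], [ 2.,  9.]])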