
    Ng                        d dl mZ d dlZd dlmZmZ d dlmZmZ d dl	Z	 ej
        e          Ze G d d                      ZdS )    )annotationsN)	dataclassfield)AnyCallablec                  x    e Zd ZU dZded<    ed           Zded<    eedd	          Zd
ed<   ddZ	ddZ
dS )SentenceTransformerDataCollatora  Collator for a SentenceTransformers model.
    This encodes the text columns to {column}_input_ids and {column}_attention_mask columns.
    This works with the two text dataset that is used as the example in the training overview:
    https://www.sbert.net/docs/sentence_transformer/training_overview.html

    It is important that the columns are in the expected order. For example, if your dataset has columns
    "answer", "question" in that order, then the MultipleNegativesRankingLoss will consider
    "answer" as the anchor and "question" as the positive, and it will (unexpectedly) optimize for
    "given the answer, what is the question?".
    r   tokenize_fnc                 
    ddgS )Nlabelscore r       _/var/www/html/ai-engine/env/lib/python3.11/site-packages/sentence_transformers/data_collator.py<lambda>z(SentenceTransformerDataCollator.<lambda>   s    GWCU r   )default_factory	list[str]valid_label_columnsF)r   initreprzset[tuple[str]]_warned_columnsfeatureslist[dict[str, Any]]returndict[str, torch.Tensor]c                   t          |d                                                   }i }d|v r&|                    d           |d         d         |d<   t          |          | j        vr|                     |           | j        D ]@|v r:t          j        fd|D                       |d<   |                                nA|D ]	                    d          rJd t          d                    |v r0t          j        fd|D             t          j                  |<   a|                     fd|D                       }|                                D ]\  }}|| d	| <   |S )
Nr   dataset_namec                     g | ]
}|         S r   r   ).0rowlabel_columns     r   
<listcomp>z<SentenceTransformerDataCollator.__call__.<locals>.<listcomp>-   s    .U.U.USs</@.U.U.Ur   r   _prompt_lengthc                     g | ]
}|         S r   r   r   r    column_names     r   r"   z<SentenceTransformerDataCollator.__call__.<locals>.<listcomp>4   s    2X2X2X3{3C2X2X2Xr   )dtypec                     g | ]
}|         S r   r   r%   s     r   r"   z<SentenceTransformerDataCollator.__call__.<locals>.<listcomp>7   s    )O)O)Os#k*:)O)O)Or   _)listkeysremovetupler   maybe_warn_about_column_orderr   torchtensorendswithlenintr
   items)	selfr   column_namesbatch	tokenizedkeyvaluer&   r!   s	          @@r   __call__z(SentenceTransformerDataCollator.__call__   s   HQK,,..// \))///$,QK$?E.!d&:::..|<<< !4 	 	L|++!&.U.U.U.UH.U.U.U!V!Vg##L111 ,
 ( 	6 	6K##$455 +F^M]I^I^H^F^:_co:o:o%*\2X2X2X2Xx2X2X2X`e`i%j%j%jk"(()O)O)O)Oh)O)O)OPPI'oo// 6 6
U05,,s,,--6 r   r6   Nonec                   ddddddddddd
}|                                 D ]\  }}||v rx|                    |          |k    r_|dv rg d}n|dv rddg}n|d	v rd
dg}n|dv rg d}t                              d|d|                    |           d| d| d	            n| j                            t          |                     dS )zBWarn the user if the columns are likely not in the expected order.r         )
anchorpositivenegativequestionanswerqueryresponse
hypothesis
entailmentcontradiction)r@   rA   rB   )rC   rD   rC   rD   )rE   rF   rE   rF   )rG   rH   rI   zColumn z is at index z?, whereas a column with this name is usually expected at index a?  . Note that the column order can be important for some losses, e.g. MultipleNegativesRankingLoss will always consider the first column as the anchor and the second as the positive, regardless of the dataset column names. Consider renaming the columns to match the expected order, e.g.:
dataset = dataset.select_columns()N)r4   indexloggerwarningr   addr-   )r5   r6   column_name_to_expected_idxr&   expected_idxproposed_fix_columnss         r   r.   z=SentenceTransformerDataCollator.maybe_warn_about_column_order=   sw    '
 '
# *E)J)J)L)L 	 	%Kl**|/A/A+/N/NR^/^/^"DDD+M+M+M(( $:::,6+A(( $999,3Z+@(( $QQQ+X+X+X(Pk P P,:L:L[:Y:Y P PLXP P
 9MP P P     |!4!455555r   N)r   r   r   r   )r6   r   r   r<   )__name__
__module____qualname____doc____annotations__r   r   setr   r;   r.   r   r   r   r	   r	      s         	 	 %*U;U;U%V%V%VVVVV',uSuSX'Y'Y'YOYYYY   @%6 %6 %6 %6 %6 %6r   r	   )
__future__r   loggingdataclassesr   r   typingr   r   r/   	getLoggerrR   rL   r	   r   r   r   <module>r]      s    " " " " " "  ( ( ( ( ( ( ( (                 		8	$	$ U6 U6 U6 U6 U6 U6 U6 U6 U6 U6r   