
    Ng\                        d dl mZ d dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
  ej        e          Z G d de          ZdS )    )annotationsN)Dataset)SentenceTransformer)InputExamplec                  Z    e Zd ZdZ	 	 dd dZ	 d!d"dZ	 	 	 d!d#dZd Zd Zd Z	d Z
d ZdS )$ParallelSentencesDatasetu  
    This dataset reader can be used to read-in parallel sentences, i.e., it reads in a file with tab-seperated sentences with the same
    sentence in different languages. For example, the file can look like this (EN	DE	ES):
    hello world     hallo welt  hola mundo
    second sentence zweiter satz    segunda oración

    The sentence in the first column will be mapped to a sentence embedding using the given the embedder. For example,
    embedder is a mono-lingual sentence embedding method for English. The sentences in the other languages will also be
    mapped to this English sentence embedding.

    When getting a sample from the dataset, we get one sentence with the according sentence embedding for this sentence.

    teacher_model can be any class that implement an encode function. The encode function gets a list of sentences and
    returns a list of sentence embeddings
       Tstudent_modelr   teacher_model
batch_sizeintuse_embedding_cacheboolc                    || _         || _        g | _        g | _        g | _        g | _        g | _        g | _        || _        || _	        i | _
        d| _        dS )a+  
        Parallel sentences dataset reader to train student model given a teacher model

        Args:
            student_model (SentenceTransformer): The student sentence embedding model that should be trained.
            teacher_model (SentenceTransformer): The teacher model that provides the sentence embeddings for the first column in the dataset file.
            batch_size (int, optional): The batch size for training. Defaults to 8.
            use_embedding_cache (bool, optional): Whether to use an embedding cache. Defaults to True.
        r   N)r
   r   datasetsdatasets_iteratordatasets_tokenizeddataset_indicescopy_dataset_indicescacher   r   embedding_cachenum_sentences)selfr
   r   r   r   s        s/var/www/html/ai-engine/env/lib/python3.11/site-packages/sentence_transformers/datasets/ParallelSentencesDataset.py__init__z!ParallelSentencesDataset.__init__    sg      +*!#"$!$&!
$#6 !    d   N   filepathstrweightmax_sentencesmax_sentence_lengthreturnNonec                   t                               d|z              g }|                    d          rt          j        |dd          nt	          |d          5 }d}|D ]y}|                                                    d          }	|$|dk    rt          d	 |	D                       |k    rO|                    |	           |d
z  }||dk    r||k    r nzddd           n# 1 swxY w Y   | 	                    ||||           dS )a  
        Reads in a tab-seperated .txt/.csv/.tsv or .gz file. The different columns contain the different translations of the sentence in the first column

        Args:
            filepath (str): Filepath to the file.
            weight (int, optional): If more than one dataset is loaded with load_data, specifies the frequency at which data should be sampled from this dataset. Defaults to 100.
            max_sentences (int, optional): Maximum number of lines to be read from the filepath. Defaults to None.
            max_sentence_length (int, optional): Skip the example if one of the sentences has more characters than max_sentence_length. Defaults to 128.

        Returns:
            None
        zLoad z.gzrtutf8)encodingr   	Nc                ,    g | ]}t          |          S  len.0sents     r   
<listcomp>z6ParallelSentencesDataset.load_data.<locals>.<listcomp>[   s    ===4SYY===r      )r!   r"   r#   )
loggerinfoendswithgzipopenstripsplitmaxappendadd_dataset)
r   r   r!   r"   r#   parallel_sentencesfIncountline	sentencess
             r   	load_dataz"ParallelSentencesDataset.load_data=   s     	Gh&'''   ''1DIhv6666h000	 58E   JJLL..t44	'3+a//==9===>>ATTT")))444
 ,1B1BuP]G]G]E%	 	 	 	 	 	 	 	 	 	 	 	 	 	 	& 	v]`s 	 	
 	
 	
 	
 	
s   A?C((C,/C,r>   list[list[str]]c                  	 i 	|D ]}|$|dk    rt          d |D                       |k    r(|d         }|	vrt                      	|<   |D ]}	|                             |           ||dk    rt          	          |k    r nt          	          dk    rd S | xj        t          	fd	D                       z  c_        t          | j                  }| j                            t          		                                                     | j
                            d           | j                            |g|z             d S )Nr   c                ,    g | ]}t          |          S r,   r-   r/   s     r   r2   z8ParallelSentencesDataset.add_dataset.<locals>.<listcomp>s   s    999tT999r   c                :    g | ]}t          |                   S r,   r-   )r0   r1   sentences_maps     r   r2   z8ParallelSentencesDataset.add_dataset.<locals>.<listcomp>   s&    "V"V"V3}T':#;#;"V"V"Vr   )r;   setaddr.   r   sumr   r<   listitemsr   r   extend)
r   r>   r!   r"   r#   rB   source_sentencer1   
dataset_idrH   s
            @r   r=   z$ParallelSentencesDataset.add_datasetg   s    + 	 	I#/'!++99y999::=PPP'lOm3314o.! 9 9o.2248888(]Q->->3}CUCUYfCfCf}""Fc"V"V"V"V"V"V"VWWW''
T-"5"5"7"788999%%a(((##ZL6$9:::::r   c                   g }g }| j         D ]D}|                     |          \  }}|                    |           |                    |           E|                     |          }t	          ||          D ]4\  }}|D ],}| j                            t          |g|                     -5t          j        | j                   d S )N)textslabel)	r   
next_entryr<   get_embeddingszipr   r   randomshuffle)	r   source_sentences_listtarget_sentences_listdata_idxsrc_sentencetrg_sentencessrc_embeddingssrc_embeddingtrg_sentences	            r   generate_dataz&ParallelSentencesDataset.generate_data   s     " ", 	8 	8H*.//(*C*C'L-!((666!((7777 ,,-BCC,/@U,V,V 	[ 	[(M= - [ [
!!,l^="Y"Y"YZZZZ[ 	tz"""""r   c                   | j         |         | j        |                  \  }}| j        |xx         dz  cc<   | j        |         t          | j         |                   k    r)d| j        |<   t          j        | j         |                    ||fS )Nr3   r   )r   r   r.   rW   rX   )r   r[   sourcetarget_sentencess       r   rT   z#ParallelSentencesDataset.next_entry   s    #'=#:4;QRZ;[#\  x(((A-(((!(+s4=3J/K/KKK/0D"8,N4=2333'''r   c                j     j         s# j                            | j        dd          S g }|D ] }| j        vr|                    |           !t          |          dk    rC j                            | j        dd          }t          ||          D ]\  }}| j        |<    fd|D             S )NFT)r   show_progress_barconvert_to_numpyr   c                *    g | ]}j         |         S r,   )r   )r0   r1   r   s     r   r2   z;ParallelSentencesDataset.get_embeddings.<locals>.<listcomp>   s!    AAAt$T*AAAr   )r   r   encoder   r   r<   r.   rV   )r   rB   new_sentencesr1   new_embeddings	embeddings   `     r   rU   z'ParallelSentencesDataset.get_embeddings   s    ' 	%,,doae -   
  	+ 	+D4///$$T***}!!!/66$/Uei 7  N $'}n#E#E 7 7i-6$T**AAAAyAAAAr   c                    | j         S )N)r   )r   s    r   __len__z ParallelSentencesDataset.__len__   s    !!r   c                    t          | j                  dk    r|                                  | j                                        S )Nr   )r.   r   ra   pop)r   idxs     r   __getitem__z$ParallelSentencesDataset.__getitem__   s9    tz??a   z~~r   )r	   T)r
   r   r   r   r   r   r   r   )r   Nr   )
r   r    r!   r   r"   r   r#   r   r$   r%   )r>   rD   r!   r   r"   r   r#   r   )__name__
__module____qualname____doc__r   rC   r=   ra   rT   rU   rn   rr   r,   r   r   r   r      s         ( $(    < gj(
 (
 (
 (
 (
Z !#&"; "; "; "; ";H# # #"( ( (B B B*" " "         r   r   )
__future__r   r7   loggingrW   torch.utils.datar   sentence_transformersr   sentence_transformers.readersr   	getLoggerrs   r4   r   r,   r   r   <module>r}      s    " " " " " "    $ $ $ $ $ $ 5 5 5 5 5 5 6 6 6 6 6 6		8	$	$s  s  s  s  s w s  s  s  s  s r   