
    Ng2                    ^    d dl mZ d dlZd dlmZ d dlmZmZ d dl	m
Z
  G d de          ZdS )    )annotationsN)Dataset)NLTK_IMPORT_ERRORis_nltk_available)InputExamplec                  D    e Zd ZdZd fddZd Zd Zedd	            Zd
S )DenoisingAutoEncoderDataseta  
    The DenoisingAutoEncoderDataset returns InputExamples in the format: texts=[noise_fn(sentence), sentence]
    It is used in combination with the DenoisingAutoEncoderLoss: Here, a decoder tries to re-construct the
    sentence without noise.

    Args:
        sentences: A list of sentences
        noise_fn: A noise function: Given a string, it returns a string
            with noise, e.g. deleted words
    c                6    t                               |           S N)r	   delete)ss    v/var/www/html/ai-engine/env/lib/python3.11/site-packages/sentence_transformers/datasets/DenoisingAutoEncoderDataset.py<lambda>z$DenoisingAutoEncoderDataset.<lambda>   s    @[@b@bcd@e@e     	sentences	list[str]c                    t                      s+t          t          j        | j        j                            || _        || _        d S r   )r   ImportErrorr   format	__class____name__r   noise_fn)selfr   r   s      r   __init__z$DenoisingAutoEncoderDataset.__init__   sB     "" 	Q/6t~7NOOPPP" r   c                f    | j         |         }t          |                     |          |g          S )N)texts)r   r   r   )r   itemsents      r   __getitem__z'DenoisingAutoEncoderDataset.__getitem__   s0    ~d#4==#6#6"=>>>>r   c                *    t          | j                  S r   )lenr   )r   s    r   __len__z#DenoisingAutoEncoderDataset.__len__!   s    4>"""r   333333?c                |   ddl m} ddlm}  ||           }t	          |          }|dk    r| S t
          j                            |          |k    }t          |          dk    r"d|t
          j        	                    |          <    |            
                    t          j        |          |                   }|S )Nr   )word_tokenize)TreebankWordDetokenizerT)nltkr%   nltk.tokenize.treebankr&   r!   nprandomrandsumchoice
detokenizearray)text	del_ratior%   r&   wordsnkeep_or_notwords_processeds           r   r   z"DenoisingAutoEncoderDataset.delete%   s    &&&&&&BBBBBBd##JJ66KinnQ'')3{q  /3K	((++,1133>>rx{?[\\r   N)r   r   )r#   )	r   
__module____qualname____doc__r   r   r"   staticmethodr    r   r   r	   r	   
   s~        	 	 7f6e ! ! ! ! !? ? ?# # #    \  r   r	   )
__future__r   numpyr)   torch.utils.datar   transformers.utils.import_utilsr   r   *sentence_transformers.readers.InputExampler   r	   r:   r   r   <module>r@      s    " " " " " "     $ $ $ $ $ $ P P P P P P P P C C C C C C) ) ) ) )' ) ) ) ) )r   