
    NgE                        d dl mZ d dlZd dlZd dlZd dlmZ d dlZd dlmZm	Z	 ddl
mZ  ej        e          Z G d de	j                  ZdS )	    )annotationsN)Literal)Tensornn   )WhitespaceTokenizerc                  p     e Zd ZdZi ddfd fdZd dZd!dZd Z	 d"d#dZd Z	d Z
ed             Z xZS )$BoWzImplements a Bag-of-Words (BoW) model to derive sentence embeddings.

    A weighting can be added to allow the generation of tf-idf vectors. The output vector has the size of the vocab.
    r   Tvocab	list[str]word_weightsdict[str, float]unknown_word_weightfloatcumulative_term_frequencyboolc                p   t                                                       t          t          |                    }g d| _        || _        || _        || _        || _        g | _	        d}|D ]a}|}||v r	||         }n6|
                                |v r||
                                         }n|dz  }| j	                            |           bt                              | dt          |           d|            t          |t                      d          | _        t          |          | _        d S )N)r   r   r   r   r   r   z out of z0 words without a weighting value. Set weight to F)
stop_wordsdo_lower_case)super__init__listsetconfig_keysr   r   r   r   weightslowerappendloggerinfolenr   	tokenizersentence_embedding_dimension)	selfr   r   r   r   num_unknown_wordswordweight	__class__s	           \/var/www/html/ai-engine/env/lib/python3.11/site-packages/sentence_transformers/models/BoW.pyr   zBoW.__init__   sG    	SZZ  hhh
(#6 )B&  	( 	(D(F|##%d+--%djjll3!Q&!L'''' {{#e**{{fy{{	
 	
 	
 -UsuuTYZZZ,/JJ)))    featuresdict[str, Tensor]c                    |S N )r#   r*   s     r(   forwardzBoW.forward9   s    r)   textsreturn	list[int]c                N      fd|D             }                      |          S )Nc                6    g | ]} j         j        |fi S r.   )r!   tokenize).0textkwargsr#   s     r(   
<listcomp>z BoW.tokenize.<locals>.<listcomp>>   s1    OOO,T^,T<<V<<OOOr)   )get_sentence_features)r#   r0   r8   	tokenizeds   ` ` r(   r5   zBoW.tokenize=   s4    OOOOOOOO	)))444r)   c                    | j         S r-   )r"   r#   s    r(    get_sentence_embedding_dimensionz$BoW.get_sentence_embedding_dimensionA   s    00r)   r   tokenized_textslist[list[int]]pad_seq_lengthint1dict[Literal['sentence_embedding'], torch.Tensor]c                :   g }|D ]}t          j        |                                 t           j                  }|D ]5}| j        r||xx         | j        |         z  cc<   %| j        |         ||<   6|                    |           dt          j        |          iS )N)dtypesentence_embedding)torchzerosr>   float32r   r   r   stack)r#   r?   rA   vectorstokensvectortokens          r(   r:   zBoW.get_sentence_featuresD   s     % 	# 	#F[!F!F!H!HPUP]^^^F 8 81 85MMMT\%%88MMMM$(L$7F5MMNN6""""$ek'&:&:;;r)   c                *      fd j         D             S )Nc                ,    i | ]}|j         |         S r.   )__dict__)r6   keyr#   s     r(   
<dictcomp>z'BoW.get_config_dict.<locals>.<dictcomp>U   s"    DDDCT]3'DDDr)   )r   r=   s   `r(   get_config_dictzBoW.get_config_dictT   s     DDDD43CDDDDr)   c                    t          t          j                            |d          d          5 }t	          j        |                                 |d           d d d            d S # 1 swxY w Y   d S )Nconfig.jsonw   )indent)openospathjoinjsondumprT   )r#   output_pathfOuts      r(   savezBoW.saveW   s    "',,{M::C@@ 	>DId**,,d1====	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	> 	>s   *A&&A*-A*c                    t          t          j                            | d                    5 }t	          j        |          }d d d            n# 1 swxY w Y   t          di |S )NrV   r.   )rZ   r[   r\   r]   r^   loadr
   )
input_pathfInconfigs      r(   rd   zBoW.load[   s    "',,z=99:: 	$cYs^^F	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ 	$ }}V}}s   AAA)r   r   r   r   r   r   r   r   )r*   r+   )r0   r   r1   r2   )r   )r?   r@   rA   rB   r1   rC   )__name__
__module____qualname____doc__r   r/   r5   r>   r:   rT   rb   staticmethodrd   __classcell__)r'   s   @r(   r
   r
      s          *,%&*.!7 !7 !7 !7 !7 !7 !7F   5 5 5 51 1 1 GH< < < < < E E E> > >   \    r)   r
   )
__future__r   r^   loggingr[   typingr   rG   r   r   r!   r   	getLoggerrh   r   Moduler
   r.   r)   r(   <module>rs      s    " " " " " "   				                * * * * * *		8	$	$P P P P P") P P P P Pr)   