from __future__ import annotations

import math
import os
from pathlib import Path

import numpy as np
import torch
from safetensors.torch import load_file as load_safetensors_file
from safetensors.torch import save_file as save_safetensors_file
from tokenizers import Tokenizer
from torch import nn
from transformers import PreTrainedTokenizerFast

from sentence_transformers.util import get_device_name


class StaticEmbedding(nn.Module):
    def __init__(
        self,
        tokenizer: Tokenizer | PreTrainedTokenizerFast,
        embedding_weights: np.array | torch.Tensor | None = None,
        embedding_dim: int | None = None,
        **kwargs,
    ) -> None:
        """
        Initializes the StaticEmbedding model given a tokenizer. The model is a simple embedding bag model that
        takes the mean of trained per-token embeddings to compute text embeddings.

        Args:
            tokenizer (Tokenizer | PreTrainedTokenizerFast): The tokenizer to be used. Must be a fast tokenizer
                from ``transformers`` or ``tokenizers``.
            embedding_weights (np.array | torch.Tensor | None, optional): Pre-trained embedding weights.
                Defaults to None.
            embedding_dim (int | None, optional): Dimension of the embeddings. Required if embedding_weights
                is not provided. Defaults to None.

        Example::

            from sentence_transformers import SentenceTransformer
            from sentence_transformers.models import StaticEmbedding
            from tokenizers import Tokenizer

            # Pre-distilled embeddings:
            static_embedding = StaticEmbedding.from_model2vec("minishlab/M2V_base_output")
            # or distill your own embeddings:
            static_embedding = StaticEmbedding.from_distillation("BAAI/bge-base-en-v1.5", device="cuda")
            # or start with randomized embeddings:
            tokenizer = Tokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
            static_embedding = StaticEmbedding(tokenizer, embedding_dim=512)

            model = SentenceTransformer(modules=[static_embedding])

            embeddings = model.encode(["What are Pandas?", "The giant panda (Ailuropoda melanoleuca; Chinese: 大熊猫; pinyin: dàxióngmāo), also known as the panda bear or simply the panda, is a bear native to south central China."])
            similarity = model.similarity(embeddings[0], embeddings[1])
            # tensor([[0.9177]]) (If you use the distilled bge-base)

        Raises:
            ValueError: If the tokenizer is not a fast tokenizer.
            ValueError: If neither `embedding_weights` nor `embedding_dim` is provided.
        """
        super().__init__()
        if isinstance(tokenizer, PreTrainedTokenizerFast):
            tokenizer = tokenizer._tokenizer
        elif not isinstance(tokenizer, Tokenizer):
            raise ValueError(
                "The tokenizer must be fast (i.e. Rust-backed) to use this class. "
                "Use Tokenizer.from_pretrained() from `tokenizers` to load a fast tokenizer."
            )

        if embedding_weights is not None:
            if isinstance(embedding_weights, np.ndarray):
                embedding_weights = torch.from_numpy(embedding_weights)

            # EmbeddingBag defaults to mode="mean": the forward pass averages the
            # per-token embeddings of each input text.
            self.embedding = nn.EmbeddingBag.from_pretrained(embedding_weights, freeze=False)
        elif embedding_dim is not None:
            self.embedding = nn.EmbeddingBag(tokenizer.get_vocab_size(), embedding_dim)
        else:
            raise ValueError("Either `embedding_weights` or `embedding_dim` must be provided.")

        self.num_embeddings = self.embedding.num_embeddings
        self.embedding_dim = self.embedding.embedding_dim

        self.tokenizer: Tokenizer = tokenizer
        self.tokenizer.no_padding()

        self.base_model = kwargs.get("base_model", None)

    def tokenize(self, texts: list[str], **kwargs) -> dict[str, torch.Tensor]:
        encodings = self.tokenizer.encode_batch(texts, add_special_tokens=False)
        encodings_ids = [encoding.ids for encoding in encodings]

        # Flatten all token ids into a single tensor; `offsets` marks where each
        # text starts, in the format expected by nn.EmbeddingBag.
        offsets = torch.from_numpy(np.cumsum([0] + [len(token_ids) for token_ids in encodings_ids[:-1]]))
        input_ids = torch.tensor(
            [token_id for token_ids in encodings_ids for token_id in token_ids], dtype=torch.long
        )
        return {"input_ids": input_ids, "offsets": offsets}

    def forward(self, features: dict[str, torch.Tensor], **kwargs) -> dict[str, torch.Tensor]:
        features["sentence_embedding"] = self.embedding(features["input_ids"], features["offsets"])
        return features

    def get_config_dict(self) -> dict[str, float]:
        return {}

    @property
    def max_seq_length(self) -> int:
        return math.inf

    def get_sentence_embedding_dimension(self) -> int:
        return self.embedding_dim

    def save(self, save_dir: str, safe_serialization: bool = True, **kwargs) -> None:
        if safe_serialization:
            save_safetensors_file(self.state_dict(), os.path.join(save_dir, "model.safetensors"))
        else:
            torch.save(self.state_dict(), os.path.join(save_dir, "pytorch_model.bin"))
        self.tokenizer.save(str(Path(save_dir) / "tokenizer.json"))

    @staticmethod
    def load(load_dir: str, **kwargs) -> StaticEmbedding:
        tokenizer = Tokenizer.from_file(str(Path(load_dir) / "tokenizer.json"))
        if os.path.exists(os.path.join(load_dir, "model.safetensors")):
            weights = load_safetensors_file(os.path.join(load_dir, "model.safetensors"))
        else:
            weights = torch.load(
                os.path.join(load_dir, "pytorch_model.bin"), map_location=torch.device("cpu"), weights_only=True
            )
        weights = weights["embedding.weight"]
        return StaticEmbedding(tokenizer, embedding_weights=weights)

    @classmethod
    def from_distillation(
        cls,
        model_name: str,
        vocabulary: list[str] | None = None,
        device: str | None = None,
        pca_dims: int | None = 256,
        apply_zipf: bool = True,
        use_subword: bool = True,
    ) -> StaticEmbedding:
        """
        Creates a StaticEmbedding instance from a distillation process using the `model2vec` package.

        Args:
            model_name (str): The name of the model to distill.
            vocabulary (list[str] | None, optional): A list of vocabulary words to use. Defaults to None.
            device (str | None, optional): The device to run the distillation on (e.g., 'cpu', 'cuda'). If not
                specified, the strongest device is automatically detected. Defaults to None.
            pca_dims (int | None, optional): The number of dimensions for PCA reduction. Defaults to 256.
            apply_zipf (bool): Whether to apply Zipf's law during distillation. Defaults to True.
            use_subword (bool): Whether to use subword tokenization. Defaults to True.

        Returns:
            StaticEmbedding: An instance of StaticEmbedding initialized with the distilled model's
                tokenizer and embedding weights.
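
        Example::

            # Usage sketch mirroring the class docstring above; assumes the
            # `model2vec[distill]` extra is installed and the model can be downloaded.
            static_embedding = StaticEmbedding.from_distillation("BAAI/bge-base-en-v1.5", device="cuda")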

        Raises:
            ImportError: If the `model2vec` package is not installed.
        """
        try:
            from model2vec.distill import distill
        except ImportError:
            raise ImportError(
                "To use this method, please install the `model2vec` package: `pip install model2vec[distill]`"
            )

        # Detect the strongest available device if none was specified.
        if device is None:
            device = get_device_name()
        static_model = distill(
            model_name,
            vocabulary=vocabulary,
            device=device,
            pca_dims=pca_dims,
            apply_zipf=apply_zipf,
            use_subword=use_subword,
        )
        if isinstance(static_model.embedding, np.ndarray):
            embedding_weights = torch.from_numpy(static_model.embedding)
        else:
            embedding_weights = static_model.embedding.weight
        tokenizer: Tokenizer = static_model.tokenizer

        return cls(tokenizer, embedding_weights=embedding_weights, base_model=model_name)

    @classmethod
    def from_model2vec(cls, model_id_or_path: str) -> StaticEmbedding:
        """
        Create a StaticEmbedding instance from a model2vec model. This method loads a pre-trained model2vec model
        and extracts the embedding weights and tokenizer to create a StaticEmbedding instance.

        Args:
            model_id_or_path (str): The identifier or path to the pre-trained model2vec model.

        Returns:
            StaticEmbedding: An instance of StaticEmbedding initialized with the tokenizer and embedding weights
                from the model2vec model.
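
        Example::

            # Usage sketch mirroring the class docstring above; assumes the
            # `model2vec` package is installed.
            static_embedding = StaticEmbedding.from_model2vec("minishlab/M2V_base_output")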

        Raises:
            ImportError: If the `model2vec` package is not installed.
        """
        try:
            from model2vec import StaticModel
        except ImportError:
            raise ImportError("To use this method, please install the `model2vec` package: `pip install model2vec`")

        static_model = StaticModel.from_pretrained(model_id_or_path)
        if isinstance(static_model.embedding, np.ndarray):
            embedding_weights = torch.from_numpy(static_model.embedding)
        else:
            embedding_weights = static_model.embedding.weight
        tokenizer: Tokenizer = static_model.tokenizer

        return cls(tokenizer, embedding_weights=embedding_weights, base_model=model_id_or_path)