"""Tokenization classes for ESM."""

import os
from typing import List, Optional

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}


def load_vocab_file(vocab_file):
    # The vocabulary file holds one token per line; strip surrounding whitespace.
    with open(vocab_file, "r") as f:
        lines = f.read().splitlines()
        return [l.strip() for l in lines]


class EsmTokenizer(PreTrainedTokenizer):
    """
    Constructs an ESM tokenizer.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        cls_token="<cls>",
        pad_token="<pad>",
        mask_token="<mask>",
        eos_token="<eos>",
        **kwargs,
    ):
        self.all_tokens = load_vocab_file(vocab_file)
        self._id_to_token = dict(enumerate(self.all_tokens))
        self._token_to_id = {tok: ind for ind, tok in enumerate(self.all_tokens)}
        super().__init__(
            unk_token=unk_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            eos_token=eos_token,
            **kwargs,
        )

        # Register every vocabulary token as a no-split token so the tokenizer's
        # trie splits raw text directly on them (e.g. "LAGV" -> L, A, G, V).
        self.unique_no_split_tokens = self.all_tokens
        self._update_trie(self.unique_no_split_tokens)

    def _convert_id_to_token(self, index: int) -> str:
        return self._id_to_token.get(index, self.unk_token)

    def _convert_token_to_id(self, token: str) -> int:
        return self._token_to_id.get(token, self._token_to_id.get(self.unk_token))

    def _tokenize(self, text, **kwargs):
        return text.split()

    def get_vocab(self):
        base_vocab = self._token_to_id.copy()
        base_vocab.update(self.added_tokens_encoder)
        return base_vocab

    def token_to_id(self, token: str) -> int:
        return self._token_to_id.get(token, self._token_to_id.get(self.unk_token))

    def id_to_token(self, index: int) -> str:
        return self._id_to_token.get(index, self.unk_token)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        cls = [self.cls_token_id]
        sep = [self.eos_token_id]  # the ESM vocabulary has no sep token; EOS doubles as the separator
        if token_ids_1 is None:
            if self.eos_token_id is None:
                return cls + token_ids_0
            return cls + token_ids_0 + sep
        if self.eos_token_id is None:
            raise ValueError("Cannot tokenize multiple sequences when EOS token is not set!")
        return cls + token_ids_0 + sep + token_ids_1 + sep  # multiple inputs always end with an EOS token

    def get_special_tokens_mask(
        self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of ids of the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                List of ids of the second sequence.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )

            return [1 if token in self.all_special_ids else 0 for token in token_ids_0]
        mask = [1] + ([0] * len(token_ids_0)) + [1]
        if token_ids_1 is not None:
            mask += [0] * len(token_ids_1) + [1]
        return mask

    def save_vocabulary(self, save_directory, filename_prefix):
        vocab_file = os.path.join(save_directory, (filename_prefix + "-" if filename_prefix else "") + "vocab.txt")
        with open(vocab_file, "w") as f:
            f.write("\n".join(self.all_tokens))
        return (vocab_file,)

    @property
    def vocab_size(self) -> int:
        return len(self.all_tokens)
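

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative; not part of the original module). It
# writes a tiny example vocabulary to a temporary file and round-trips one
# sequence. The token set below is an assumption for demonstration, not the
# real ESM vocabulary. Because this module uses relative imports, run it as
# `python -m transformers.models.esm.tokenization_esm`.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import tempfile

    # One token per line, as load_vocab_file() expects.
    example_tokens = ["<cls>", "<pad>", "<eos>", "<unk>", "L", "A", "G", "V", "<mask>"]
    with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as tmp:
        tmp.write("\n".join(example_tokens))

    tokenizer = EsmTokenizer(vocab_file=tmp.name)
    # Each residue is a no-split token, so "LAGV" splits without spaces and is
    # wrapped as <cls> ... <eos> by build_inputs_with_special_tokens().
    ids = tokenizer("LAGV")["input_ids"]
    print(ids)  # expected with this example vocab: [0, 4, 5, 6, 7, 2]
    print(tokenizer.convert_ids_to_tokens(ids))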