"""Tokenization classes for BioGPT."""

import json
import os
from typing import List, Optional, Tuple

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}


def get_pairs(word):
    """
    Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
    strings)
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs
           e Zd ZdZeZddgZ	 	 	 	 	 d  fd	Zed	             Z	d
 Z
d Zd Zd Zd!dZd Zd Zd Z	 d"dee         deee                  dee         fdZ	 d#dee         deee                  dedee         f fdZ	 d"dee         deee                  dee         fdZd"dedee         dee         fdZd Zd Z xZS )$BioGptTokenizera:  
    Construct a FAIRSEQ Transformer tokenizer. Moses tokenization followed by Byte-Pair Encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.
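
    Example (a minimal sketch using the published `microsoft/biogpt` checkpoint):

    ```python
    >>> from transformers import BioGptTokenizer

    >>> tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
    >>> pieces = tokenizer.tokenize("BioGPT is a generative model for biomedical text.")
    >>> text = tokenizer.convert_tokens_to_string(pieces)
    ```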

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Merges file.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        merges_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        pad_token="<pad>",
        **kwargs,
    ):
        try:
            import sacremoses
        except ImportError:
            raise ImportError(
                "You need to install sacremoses to use BioGptTokenizer. "
                "See https://pypi.org/project/sacremoses/ for installation."
            )

        self.lang = "en"
        self.sm = sacremoses

        # cache sacremoses MosesTokenizer/MosesDetokenizer instances per language
        self.cache_moses_tokenizer = {}
        self.cache_moses_detokenizer = {}

        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        with open(merges_file, encoding="utf-8") as merges_handle:
            merges = merges_handle.read().split("\n")[:-1]
        merges = [tuple(merge.split()[:2]) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            unk_token=unk_token,
            pad_token=pad_token,
            **kwargs,
        )
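
    # `vocab.json` maps BPE pieces to ids; `merges.txt` holds one space-separated
    # merge per line (e.g. a line like "a b</w>"), and its line order provides the
    # merge priorities that `bpe` consumes via `self.bpe_ranks`.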

    @property
    def vocab_size(self):
        """Returns vocab size"""
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def moses_tokenize(self, text, lang):
        if lang not in self.cache_moses_tokenizer:
            moses_tokenizer = self.sm.MosesTokenizer(lang=lang)
            self.cache_moses_tokenizer[lang] = moses_tokenizer
        return self.cache_moses_tokenizer[lang].tokenize(
            text, aggressive_dash_splits=True, return_str=False, escape=True
        )

    def moses_detokenize(self, tokens, lang):
        if lang not in self.cache_moses_detokenizer:
            moses_detokenizer = self.sm.MosesDetokenizer(lang=lang)
            self.cache_moses_detokenizer[lang] = moses_detokenizer
        return self.cache_moses_detokenizer[lang].detokenize(tokens)

    def bpe(self, token):
        # append the end-of-word marker to the last symbol before merging
        word = tuple(token[:-1]) + (token[-1] + "</w>",)
        if token in self.cache:
            return self.cache[token]
        pairs = get_pairs(word)

        if not pairs:
            return token + "</w>"

        while True:
            # merge the best-ranked pair until no known merge remains
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        if word == "\n  </w>":
            word = "\n</w>"
        self.cache[token] = word
        return word

    def _tokenize(self, text, bypass_tokenizer=False):
        """Returns a list of BPE subword tokens (Moses tokenization followed by BPE)."""
        if bypass_tokenizer:
            text = text.split()
        else:
            text = self.moses_tokenize(text, self.lang)

        split_tokens = []
        for token in text:
            if token:
                split_tokens.extend(list(self.bpe(token).split(" ")))

        return split_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) to a single string."""
        # strip BPE markers, then let Moses detokenize
        tokens = [t.replace(" ", "").replace("</w>", " ") for t in tokens]
        tokens = "".join(tokens).split()
        text = self.moses_detokenize(tokens, self.lang)
        return text

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. A BioGPT sequence has the following format:

        - single sequence: `</s> X `
        - pair of sequences: `</s> A </s> B `

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
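
        Example (sketch with the published `microsoft/biogpt` checkpoint):

        ```python
        >>> from transformers import BioGptTokenizer

        >>> tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
        >>> ids = tokenizer.build_inputs_with_special_tokens([4, 5])
        >>> ids[0] == tokenizer.sep_token_id  # a single sequence is rendered as `</s> X`
        True
        ```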
        """
        if token_ids_1 is None:
            return [self.sep_token_id] + token_ids_0
        sep = [self.sep_token_id]
        return sep + token_ids_0 + sep + token_ids_1

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )
        # no bos used in fairseq
        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
        return [1] + ([0] * len(token_ids_0))

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A FAIRSEQ
        Transformer sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]

        # no bos used in fairseq
        if token_ids_1 is None:
            return len(token_ids_0 + sep) * [0]
        return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        with open(merge_file, "w", encoding="utf-8") as writer:
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return vocab_file, merge_file

    def __getstate__(self):
        # the sacremoses module handle cannot be pickled; drop it and restore it in __setstate__
        state = self.__dict__.copy()
        state["sm"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        try:
            import sacremoses
        except ImportError:
            raise ImportError(
                "You need to install sacremoses to use BioGptTokenizer. "
                "See https://pypi.org/project/sacremoses/ for installation."
            )

        self.sm = sacremoses