
    g#3                         d Z ddlZddlZddlmZ ddlmZmZmZ ddl	m
Z
 ddlmZ  ej        e          Zdd	d
Zd Z G d de
          ZdS )z Tokenization classes for PhoBERT    N)copyfile)ListOptionalTuple   )PreTrainedTokenizer)loggingz	vocab.txtz	bpe.codes)
vocab_filemerges_filec                     t                      }| d         }| dd         D ]}|                    ||f           |}t          |          }|S )z
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    r      N)setadd)wordpairs	prev_charchars       l/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/phobert/tokenization_phobert.py	get_pairsr   #   s[     EEEQIQRR  		9d#$$$		JJEL    c            
           e Zd ZdZeZ	 	 	 	 	 	 	 d fd	Z	 dd	ee         d
e	ee                  dee         fdZ
	 dd	ee         d
e	ee                  dedee         f fdZ	 dd	ee         d
e	ee                  dee         fdZed             Zd Zd Zd Zd Zd Zd Zddede	e         dee         fdZd Z xZS )PhobertTokenizeraO	  
    Construct a PhoBERT tokenizer. Based on Byte-Pair-Encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        bos_token (`st`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
    <s></s><unk><pad><mask>c
                    || _         || _        i | _        d| j        t          |          <   d| j        t          |          <   d| j        t          |          <   d| j        t          |          <   |                     |           d | j                                        D             | _        t          |d          5 }|                                	                    d          d d	         }d d d            n# 1 swxY w Y   d
 |D             }t          t          |t          t          |                                        | _        i | _         t!                      j        d|||||||	d|
 d S )Nr   r      r   c                     i | ]\  }}||	S  r!   ).0kvs      r   
<dictcomp>z-PhobertTokenizer.__init__.<locals>.<dictcomp>~   s    >>>A1>>>r   utf-8encoding
c                 `    g | ]+}t          |                                d d                   ,S )Nr*   )tuplesplit)r"   merges     r   
<listcomp>z-PhobertTokenizer.__init__.<locals>.<listcomp>   s1    @@@%crc*++@@@r   )	bos_token	eos_token	unk_token	sep_token	cls_token	pad_token
mask_tokenr!   )r
   r   encoderstradd_from_fileitemsdecoderopenreadr-   dictziprangelen	bpe_rankscachesuper__init__)selfr
   r   r0   r1   r3   r4   r2   r5   r6   kwargsmerges_handlemerges	__class__s                r   rE   zPhobertTokenizer.__init__f   s    %&'(S^^$'(S^^$'(S^^$'(S^^$:&&&>>););)=)=>>>+000 	;M"''))//55crc:F	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	; 	;@@@@@c&%F*<*<==>>
 		
!		
 		
 		
 		
 		
 		
 		
s   0C==DDNtoken_ids_0token_ids_1returnc                 p    || j         g|z   | j        gz   S | j         g}| j        g}||z   |z   |z   |z   |z   S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A PhoBERT sequence has the following format:

        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s></s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        )cls_token_idsep_token_id)rF   rK   rL   clsseps        r    build_inputs_with_special_tokensz1PhobertTokenizer.build_inputs_with_special_tokens   s[    ( %&48I7JJJ ! ![ 3&,{:S@@r   Falready_has_special_tokensc                     |r$t                                          ||d          S |dgdgt          |          z  z   dgz   S dgdgt          |          z  z   ddgz   dgt          |          z  z   dgz   S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)rK   rL   rT   Nr   r   )rD   get_special_tokens_maskrA   )rF   rK   rL   rT   rJ   s       r   rV   z(PhobertTokenizer.get_special_tokens_mask   s    & & 	7722'[]a 3    31#K 0 001QC77sqcC,,,-A61#K@P@P:PQUVTWWWr   c                     | j         g}| j        g}|t          ||z   |z             dgz  S t          ||z   |z   |z   |z   |z             dgz  S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. PhoBERT does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        Nr   )rP   rO   rA   )rF   rK   rL   rR   rQ   s        r   $create_token_type_ids_from_sequencesz5PhobertTokenizer.create_token_type_ids_from_sequences   sm    "  ! !s[(3.//1#553$s*S0;>DEEKKr   c                 *    t          | j                  S N)rA   r7   rF   s    r   
vocab_sizezPhobertTokenizer.vocab_size   s    4<   r   c                 0    t          | j        fi | j        S rZ   )r>   r7   added_tokens_encoderr[   s    r   	get_vocabzPhobertTokenizer.get_vocab   s    DL>>D$=>>>r   c                     | j         v r j         |         S t          |          }t          t          |d d                   |d         dz   gz             }t          |          }|s|S 	 t	          | fd          }| j        vrn8|\  }}g }d}|t          |          k     r	 |                    ||          }	|                    |||	                    |	}n-# t          $ r  |                    ||d                     Y nw xY w||         |k    rC|t          |          dz
  k     r-||dz            |k    r|
                    ||z              |dz  }n |
                    ||                    |dz  }|t          |          k     t          |          }|}t          |          dk    rnt          |          }Wd	                    |          }|d d
         }| j         |<   |S )Nr*   z</w>Tc                 T    j                             | t          d                    S )Ninf)rB   getfloat)pairrF   s    r   <lambda>z&PhobertTokenizer.bpe.<locals>.<lambda>   s     1C1CD%PU,,1W1W r   )keyr   r   r   @@ )rC   r,   listr   minrB   rA   indexextend
ValueErrorappendjoin)
rF   tokenr   r   bigramfirstsecondnew_wordijs
   `         r   bpezPhobertTokenizer.bpe   s#   DJ:e$$U||T$ss)__R6(9'::;;$ 	L	($W$W$W$WXXXFT^++"ME6HAc$ii--

5!,,A
 OOD1I...AA "   OODH---E 7e##CIIM(9(9d1q5kV>S>SOOEFN333FAAOODG,,,FA c$ii--  XHD4yyA~~!$9	(: zz$CRCy 
5s   (C 'DDc                     g }t          j        d|          }|D ]J}|                    t          |                     |                              d                               K|S )zTokenize a string.z\S+\n? )refindallrm   rj   rx   r-   )rF   textsplit_tokenswordsrq   s        r   	_tokenizezPhobertTokenizer._tokenize  sf    
9d++ 	B 	BETXXe__%:%:3%?%? @ @AAAAr   c                 r    | j                             || j                             | j                            S )z0Converts a token (str) in an id using the vocab.)r7   rc   r2   )rF   rq   s     r   _convert_token_to_idz%PhobertTokenizer._convert_token_to_id  s,    |t|'7'7'G'GHHHr   c                 B    | j                             || j                  S )z=Converts an index (integer) in a token (str) using the vocab.)r;   rc   r2   )rF   rl   s     r   _convert_id_to_tokenz%PhobertTokenizer._convert_id_to_token!  s    |t~666r   c                 |    d                     |                              dd                                          }|S )z:Converts a sequence of tokens (string) in a single string.rz   rh    )rp   replacestrip)rF   tokens
out_strings      r   convert_tokens_to_stringz)PhobertTokenizer.convert_tokens_to_string%  s5    XXf%%--eR88>>@@
r   save_directoryfilename_prefixc                    t           j                            |          s t                              d| d           d S t           j                            ||r|dz   ndt          d         z             }t           j                            ||r|dz   ndt          d         z             }t           j                            | j                  t           j                            |          k    r:t           j        	                    | j                  rt          | j        |           nzt           j        	                    | j                  sVt          |d          5 }| j                                        }|                    |           d d d            n# 1 swxY w Y   t           j                            | j                  t           j                            |          k    rt          | j        |           ||fS )NzVocabulary path (z) should be a directory-r   r
   r   wb)ospathisdirloggererrorrp   VOCAB_FILES_NAMESabspathr
   isfiler   r<   sp_modelserialized_model_protowriter   )rF   r   r   out_vocab_fileout_merge_fileficontent_spiece_models          r   save_vocabularyz PhobertTokenizer.save_vocabulary*  s   w}}^,, 	LLT^TTTUUUFoM_s222QbcoQpp
 
 oM_s222QbcpQqq
 
 7??4?++rw~/N/NNNSUSZSaSabfbqSrSrNT_n555500 	/nd++ /r'+}'K'K'M'M$-.../ / / / / / / / / / / / / / / 7??4+,,0O0OOOT%~666~--s   /FFFc                    t          |t                    rs	 t          |dd          5 }|                     |           ddd           n# 1 swxY w Y   n0# t          $ r}|d}~wt
          $ r t          d| d          w xY wdS |                                }|D ]f}|                                }|	                    d          }|dk    rt          d	          |d|         }t          | j                  | j        |<   gdS )
zi
        Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
        rr&   r'   NzIncorrect encoding detected in z, please rebuild the datasetrz   r*   z5Incorrect dictionary format, expected '<token> <cnt>')
isinstancer8   r<   r9   FileNotFoundErrorUnicodeError	Exception	readlinesr   rfindrn   rA   r7   )	rF   ffdfnfelineslineTmplineidxr   s	            r   r9   zPhobertTokenizer.add_from_fileG  sq    a 	c!S7333 +r&&r***+ + + + + + + + + + + + + + +$   
 c c c a! a a abbbcF 	3 	3G==??D**S//Cbyy !XYYY:D!$T\!2!2DL	3 	3s9   A AA AA AA 
B!A##!B)r   r   r   r   r   r   r   rZ   )NF)__name__
__module____qualname____doc__r   vocab_files_namesrE   r   intr   rS   boolrV   rX   propertyr\   r_   rx   r   r   r   r   r8   r   r   r9   __classcell__)rJ   s   @r   r   r   3   s1       . .` * *
 *
 *
 *
 *
 *
Z JNA A9A3;DI3FA	cA A A A6 sxX X9X3;DI3FXkoX	cX X X X X X: JNL L9L3;DI3FL	cL L L L0 ! ! X!? ? ?* * *X  I I I7 7 7  
. .c .HSM .]bcf]g . . . .:3 3 3 3 3 3 3r   r   )r   r   r{   shutilr   typingr   r   r   tokenization_utilsr   utilsr	   
get_loggerr   r   r   r   r   r!   r   r   <module>r      s     ' & 				 				       ( ( ( ( ( ( ( ( ( ( 5 5 5 5 5 5       
	H	%	%      i3 i3 i3 i3 i3* i3 i3 i3 i3 i3r   