"""Tokenization classes for CPMAnt."""

import collections
import os
from typing import List, Optional, Tuple

from transformers.utils import is_jieba_available, requires_backends


if is_jieba_available():
    import jieba

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}


def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
        # One token per line; the line number is the token id.
        token = token.rstrip("\n")
        vocab[token] = index
    return vocab


class WordpieceTokenizer(object):
    def __init__(self, vocab, unk_token="<unk>", max_input_chars_per_word=200):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, token):
        chars = list(token)
        if len(chars) > self.max_input_chars_per_word:
            return [self.unk_token]

        start = 0
        sub_tokens = []
        while start < len(chars):
            end = len(chars)
            cur_substr = None
            # Greedy longest-match-first: shrink the candidate substring from the
            # right until it is found in the vocab.
            while start < end:
                substr = "".join(chars[start:end])
                if substr in self.vocab:
                    cur_substr = substr
                    break
                end -= 1
            if cur_substr is None:
                sub_tokens.append(self.unk_token)
                start += 1
            else:
                sub_tokens.append(cur_substr)
                start = end

        return sub_tokens
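

# Illustrative sketch of the greedy longest-match-first behavior above (the vocab
# and pieces are made up): with vocab = {"今天": 0, "天气": 1}, tokenize("今天天气")
# yields ["今天", "天气"], and any character that matches nothing in the vocab is
# emitted as the unk_token.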
           e Zd ZdZeZddgZdZ	 	 	 	 	 	 	 	 	 d% fd	Ze	d             Z
e	d             Ze	d             Ze	defd            Zd Zd Z fdZd Zdee         defdZd Zd Zd&dedee         dee         fdZd&d ee         d!ee         dee         fd"Z	 d'd ee         d!eee                  d#edee         f fd$Z xZS )(CpmAntTokenizera  
    Construct a CPMAnt tokenizer. Based on byte-level Byte-Pair-Encoding.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        bod_token (`str`, *optional*, defaults to `"<d>"`):
            The beginning of document token.
        eod_token (`str`, *optional*, defaults to `"</d>"`):
            The end of document token.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        line_token (`str`, *optional*, defaults to `"</n>"`):
            The line token.
        space_token (`str`, *optional*, defaults to `"</_>"`):
            The space token.
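
    Example (an illustrative sketch, not a doctest; it assumes the `jieba` backend is installed and that the
    `openbmb/cpm-ant-10b` checkpoint is reachable):

    ```python
    >>> from transformers import CpmAntTokenizer

    >>> tokenizer = CpmAntTokenizer.from_pretrained("openbmb/cpm-ant-10b")
    >>> tokens = tokenizer.tokenize("今天天气真好！")
    >>> ids = tokenizer.convert_tokens_to_ids(tokens)
    ```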
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    add_prefix_space = False

    def __init__(
        self,
        vocab_file,
        bod_token="<d>",
        eod_token="</d>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="<pad>",
        unk_token="<unk>",
        line_token="</n>",
        space_token="</_>",
        padding_side="left",
        **kwargs,
    ):
        requires_backends(self, ["jieba"])
        self.bod_token = bod_token
        self.eod_token = eod_token
        self.encoder = load_vocab(vocab_file)
        # The vocab stores spaces and newlines as "</_>" and "</n>"; remap them to
        # the literal characters so raw text can be encoded directly.
        self.encoder[" "] = self.encoder[space_token]
        self.encoder["\n"] = self.encoder[line_token]

        del self.encoder[space_token]
        del self.encoder[line_token]

        self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1]))
        self.decoder = {v: k for k, v in self.encoder.items()}

        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.encoder, unk_token=unk_token)

        super().__init__(
            bod_token=bod_token,
            eod_token=eod_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            unk_token=unk_token,
            line_token=line_token,
            space_token=space_token,
            padding_side=padding_side,
            **kwargs,
        )

    @property
    def bod_token_id(self):
        return self.encoder[self.bod_token]

    @property
    def eod_token_id(self):
        return self.encoder[self.eod_token]

    @property
    def newline_id(self):
        return self.encoder["\n"]

    @property
    def vocab_size(self) -> int:
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def _tokenize(self, text):
        """Tokenize a string."""
        output_tokens = []
        for x in jieba.cut(text, cut_all=False):
            output_tokens.extend(self.wordpiece_tokenizer.tokenize(x))
        return output_tokens
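
    # Illustrative only (segments depend on jieba's dictionary): jieba first cuts
    # the raw text into coarse words, e.g. "今天天气真好" -> ["今天", "天气", "真",
    # "好"], and each word is then wordpiece-split against the loaded vocab.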

    def _decode(self, token_ids, **kwargs):
        """Decode ids into a string."""
        # Drop negative ids, then strip pad/eos/bos before decoding.
        token_ids = [i for i in token_ids if i >= 0]
        token_ids = [
            x for x in token_ids if x != self.pad_token_id and x != self.eos_token_id and x != self.bos_token_id
        ]
        return super()._decode(token_ids, **kwargs)

    def check(self, token):
        return token in self.encoder

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        return "".join(tokens)

    def _convert_token_to_id(self, token):
        """Converts a token (str) into an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) into a token (str) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
            )
        else:
            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
        index = 0
        # Restore the "</_>" and "</n>" forms before writing, so the saved file
        # matches the on-disk vocab format expected by load_vocab.
        if " " in self.encoder:
            self.encoder["</_>"] = self.encoder[" "]
            del self.encoder[" "]
        if "\n" in self.encoder:
            self.encoder["</n>"] = self.encoder["\n"]
            del self.encoder["\n"]
        self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1]))
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in self.encoder.items():
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!"
                    )
                    index = token_index
                writer.write(token + "\n")
                index += 1
        return (vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: List[int] = None) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. A CPMAnt sequence has the following format:

        - single sequence: `[BOS] Sequence`.

        Args:
            token_ids_0 (`List[int]`): The first tokenized sequence to which special tokens will be added.
            token_ids_1 (`List[int]`, *optional*): An optional second tokenized sequence to which special tokens will be added.

        Returns:
            `List[int]`: The model input with special tokens.
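
        Example (an illustrative sketch; real ids depend on the loaded vocabulary): with `bos_token_id = 6`,
        `build_inputs_with_special_tokens([5], [7])` returns `[6, 5, 6, 7]`.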
        """
        if token_ids_1 is None:
            return [self.bos_token_id] + token_ids_0
        return [self.bos_token_id] + token_ids_0 + [self.bos_token_id] + token_ids_1

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`): List of IDs.
            token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
        return [1] + ([0] * len(token_ids_0))
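
# Illustrative check (a sketch, not executed code): for token_ids_0 = [5, 6] and
# token_ids_1 = [7, 8], get_special_tokens_mask returns [1, 0, 0, 1, 0, 0] -- a 1
# for each BOS slot that build_inputs_with_special_tokens inserts and a 0 for
# every ordinary sequence token.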