
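"""Tokenization classes for HerBERT.

Example (an illustrative sketch; it assumes the published `allegro/herbert-base-cased` checkpoint and the
`sacremoses` dependency are available):

```python
>>> from transformers import HerbertTokenizer

>>> tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-base-cased")
>>> tokens = tokenizer.tokenize("Kto ma lepszą sztukę, ma lepszy rząd.")  # BERT-style pre-tokenization + BPE
>>> ids = tokenizer.encode("Kto ma lepszą sztukę, ma lepszy rząd.")  # wraps the sequence in <s> ... </s>
```
"""
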
import json
import os
import re
import unicodedata
from typing import List, Optional, Tuple

from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}


def get_pairs(word):
    """
    Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
    strings)
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


def replace_unicode_punct(text):
    """
    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
    """
    text = text.replace("，", ",")
    text = re.sub(r"。\s*", ". ", text)
    text = text.replace("、", ",")
    text = text.replace("”", '"')
    text = text.replace("“", '"')
    text = text.replace("∶", ":")
    text = text.replace("：", ":")
    text = text.replace("？", "?")
    text = text.replace("《", '"')
    text = text.replace("》", '"')
    text = text.replace("）", ")")
    text = text.replace("！", "!")
    text = text.replace("（", "(")
    text = text.replace("；", ";")
    text = text.replace("１", "1")
    text = text.replace("」", '"')
    text = text.replace("「", '"')
    text = text.replace("０", "0")
    text = text.replace("３", "3")
    text = text.replace("２", "2")
    text = text.replace("５", "5")
    text = text.replace("６", "6")
    text = text.replace("９", "9")
    text = text.replace("７", "7")
    text = text.replace("８", "8")
    text = text.replace("４", "4")
    text = re.sub(r"．\s*", ". ", text)
    text = text.replace("～", "~")
    text = text.replace("’", "'")
    text = text.replace("…", "...")
    text = text.replace("━", "-")
    text = text.replace("〈", "<")
    text = text.replace("〉", ">")
    text = text.replace("【", "[")
    text = text.replace("】", "]")
    text = text.replace("％", "%")
    return text


def remove_non_printing_char(text):
    """
    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl
    """
    output = []
    for char in text:
        cat = unicodedata.category(char)
        # drop control/format/unassigned characters (Unicode categories starting with "C")
        if cat.startswith("C"):
            continue
        output.append(char)
    return "".join(output)


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens


class BasicTokenizer(object):
    """
    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).

    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
        do_split_on_punc (`bool`, *optional*, defaults to `True`):
            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
            the full context of the words, such as contractions.
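
    Example (illustrative, using the defaults; `HerbertTokenizer` below instantiates this class with
    `do_lower_case=False` and `tokenize_chinese_chars=False`):

    ```python
    >>> pre_tokenizer = BasicTokenizer(do_lower_case=True)
    >>> pre_tokenizer.tokenize("Hello, World!")
    ['hello', ',', 'world', '!']
    ```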
    """

    def __init__(
        self,
        do_lower_case=True,
        never_split=None,
        tokenize_chinese_chars=True,
        strip_accents=None,
        do_split_on_punc=True,
    ):
        if never_split is None:
            never_split = []
        self.do_lower_case = do_lower_case
        self.never_split = set(never_split)
        self.tokenize_chinese_chars = tokenize_chinese_chars
        self.strip_accents = strip_accents
        self.do_split_on_punc = do_split_on_punc

    def tokenize(self, text, never_split=None):
        """
        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.

        Args:
            never_split (`List[str]`, *optional*)
                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
                [`PreTrainedTokenizer.tokenize`]) List of tokens not to split.
        """
        # union() returns a new set by concatenating the two sets.
        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
        text = self._clean_text(text)

        if self.tokenize_chinese_chars:
            text = self._tokenize_chinese_chars(text)
        # prevents treating the same character with different unicode codepoints as different characters
        unicode_normalized_text = unicodedata.normalize("NFC", text)
        orig_tokens = whitespace_tokenize(unicode_normalized_text)
        split_tokens = []
        for token in orig_tokens:
            if token not in never_split:
                if self.do_lower_case:
                    token = token.lower()
                    if self.strip_accents is not False:
                        token = self._run_strip_accents(token)
                elif self.strip_accents:
                    token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token, never_split))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text."""
        if not self.do_split_on_punc or (never_split is not None and text in never_split):
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode blocks. Note that Japanese
        # Hiragana/Katakana and the Korean Hangul alphabet live in other blocks and are handled like any
        # other space-separated language.
        if (
            (cp >= 0x4E00 and cp <= 0x9FFF)
            or (cp >= 0x3400 and cp <= 0x4DBF)
            or (cp >= 0x20000 and cp <= 0x2A6DF)
            or (cp >= 0x2A700 and cp <= 0x2B73F)
            or (cp >= 0x2B740 and cp <= 0x2B81F)
            or (cp >= 0x2B820 and cp <= 0x2CEAF)
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)
        ):
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xFFFD or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)


class HerbertTokenizer(PreTrainedTokenizer):
    """
    Construct a BPE tokenizer for HerBERT.

    Peculiarities:

    - uses BERT's pre-tokenizer: `BasicTokenizer` splits tokens on spaces, and also on punctuation. Each occurrence
      of a punctuation character is treated separately.

    - such pre-tokenized input is then BPE-subtokenized.

    This tokenizer inherits from [`XLMTokenizer`] which contains most of the methods. Users should refer to the
    superclass for more information regarding methods.
    """

    vocab_files_names = VOCAB_FILES_NAMES

    def __init__(
        self,
        vocab_file,
        merges_file,
        tokenizer_file=None,
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        sep_token="</s>",
        bos_token="<s>",
        do_lowercase_and_remove_accent=False,
        additional_special_tokens=[
            "<special0>",
            "<special1>",
            "<special2>",
            "<special3>",
            "<special4>",
            "<special5>",
            "<special6>",
            "<special7>",
            "<special8>",
            "<special9>",
        ],
        lang2id=None,
        id2lang=None,
        **kwargs,
    ):
        try:
            import sacremoses
        except ImportError:
            raise ImportError(
                "You need to install sacremoses to use HerbertTokenizer. "
                "See https://pypi.org/project/sacremoses/ for installation."
            )

        self.sm = sacremoses

        # cache of sm.MosesPunctNormalizer instances, keyed by language
        self.cache_moses_punct_normalizer = {}
        # cache of sm.MosesTokenizer instances, keyed by language
        self.cache_moses_tokenizer = {}
        self.lang_with_custom_tokenizer = {"zh", "th", "ja"}
        self.do_lowercase_and_remove_accent = do_lowercase_and_remove_accent
        self.lang2id = lang2id
        self.id2lang = id2lang
        if lang2id is not None and id2lang is not None:
            assert len(lang2id) == len(id2lang)

        self.ja_word_tokenizer = None
        self.zh_word_tokenizer = None

        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        with open(merges_file, encoding="utf-8") as merges_handle:
            merges = merges_handle.read().split("\n")[:-1]
        merges = [tuple(merge.split()[:2]) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}

        super().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            additional_special_tokens=additional_special_tokens,
            lang2id=lang2id,
            id2lang=id2lang,
            do_lowercase_and_remove_accent=do_lowercase_and_remove_accent,
            tokenizer_file=None,
            **kwargs,
        )

        self.bert_pre_tokenizer = BasicTokenizer(
            do_lower_case=False,
            never_split=self.all_special_tokens,
            tokenize_chinese_chars=False,
            strip_accents=False,
        )

    @property
    def do_lower_case(self):
        return self.do_lowercase_and_remove_accent

    def moses_punct_norm(self, text, lang):
        if lang not in self.cache_moses_punct_normalizer:
            punct_normalizer = self.sm.MosesPunctNormalizer(lang=lang)
            self.cache_moses_punct_normalizer[lang] = punct_normalizer
        else:
            punct_normalizer = self.cache_moses_punct_normalizer[lang]
        return punct_normalizer.normalize(text)

    def moses_tokenize(self, text, lang):
        if lang not in self.cache_moses_tokenizer:
            moses_tokenizer = self.sm.MosesTokenizer(lang=lang)
            self.cache_moses_tokenizer[lang] = moses_tokenizer
        else:
            moses_tokenizer = self.cache_moses_tokenizer[lang]
        return moses_tokenizer.tokenize(text, return_str=False, escape=False)

    def moses_pipeline(self, text, lang):
        text = replace_unicode_punct(text)
        text = self.moses_punct_norm(text, lang)
        text = remove_non_printing_char(text)
        return text

    def ja_tokenize(self, text):
        if self.ja_word_tokenizer is None:
            try:
                import Mykytea

                self.ja_word_tokenizer = Mykytea.Mykytea(
                    f"-model {os.path.expanduser('~')}/local/share/kytea/model.bin"
                )
            except (AttributeError, ImportError):
                logger.error(
                    "Make sure you install KyTea (https://github.com/neubig/kytea) and its python wrapper"
                    " (https://github.com/chezou/Mykytea-python) with the following steps"
                )
                logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea")
                logger.error("2. autoreconf -i")
                logger.error("3. ./configure --prefix=$HOME/local")
                logger.error("4. make && make install")
                logger.error("5. pip install kytea")
                raise
        return list(self.ja_word_tokenizer.getWS(text))

    @property
    def vocab_size(self):
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def bpe(self, token):
        word = tuple(token[:-1]) + (token[-1] + "</w>",)
        if token in self.cache:
            return self.cache[token]
        pairs = get_pairs(word)

        if not pairs:
            return token + "</w>"

        while True:
            # merge the highest-ranked (lowest index) bigram first
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        if word == "\n  </w>":
            word = "\n</w>"
        self.cache[token] = word
        return word

    def _tokenize(self, text):
        pre_tokens = self.bert_pre_tokenizer.tokenize(text)

        split_tokens = []
        for token in pre_tokens:
            if token:
                split_tokens.extend(list(self.bpe(token).split(" ")))

        return split_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        out_string = "".join(tokens).replace("</w>", " ").strip()
        return out_string

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. An XLM sequence has the following format:

        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        bos = [self.bos_token_id]
        sep = [self.sep_token_id]

        if token_ids_1 is None:
            return bos + token_ids_0 + sep
        return bos + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLM sequence
        pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        merges_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        with open(merges_file, "w", encoding="utf-8") as writer:
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {merges_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return vocab_file, merges_file

    def __getstate__(self):
        # the sacremoses module itself cannot be pickled; drop it and re-import on unpickling
        state = self.__dict__.copy()
        state["sm"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        try:
            import sacremoses
        except ImportError:
            raise ImportError(
                "You need to install sacremoses to use HerbertTokenizer. "
                "See https://pypi.org/project/sacremoses/ for installation."
            )

        self.sm = sacremoses