
"""Tokenization classes for FSMT."""

import json
import os
import re
import unicodedata
from typing import Dict, List, Optional, Tuple

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "src_vocab_file": "vocab-src.json",
    "tgt_vocab_file": "vocab-tgt.json",
    "merges_file": "merges.txt",
}


def get_pairs(word):
    """
    Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
    strings)
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs
          } |                      dd
          } |                      dd          } |                      dd          } |                      dd          } |                      dd          } |                      dd          } |                      dd          } |                      dd          } |                      dd          } |                      dd          } |                      dd          } |                      dd          } |                      dd          } |                      d d!          } |                      d"d#          } |                      d$d%          } |                      d&d'          } |                      d(d)          } |                      d*d+          } |                      d,d-          } t          j        d.d|           } |                      d/d0          } |                      d1d2          } |                      d3d4          } |                      d5d6          } |                      d7d8          } |                      d9d:          } |                      d;d<          } |                      d=d>          } |                      d?d@          } | S )Azz
    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
    u   ，,u   。\s*z. u   、u   ”"u   “u   ∶:u   ：u   ？?u   《u   》u   ）)u   ！!u   （(u   ；;u   １1u   」u   「u   ０0u   ３3u   ２2u   ５5u   ６6u   ９9u   ７7u   ８8u   ４4u   ．\s*u   ～~u   ’'u   …z...u   ━-u   〈<u   〉>u   【[u   】]u   ％%)replaceresub)texts    r   replace_unicode_punctr7   1   s    <<s##D6)T4((D<<s##D<<s##D<<s##D<<s##D<<s##D<<s##D<<s##D<<s##D<<s##D<<s##D<<s##D<<s##D<<s##D<<s##D<<s##D<<s##D<<s##D<<s##D<<s##D<<s##D<<s##D<<s##D<<s##D<<s##D6)T4((D<<s##D<<s##D<<u%%D<<s##D<<s##D<<s##D<<s##D<<s##D<<s##DKr   c                     g }| D ]A}t          j        |          }|                    d          r,|                    |           Bd                    |          S )zw


def remove_non_printing_char(text):
    """
    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl
    """
    output = []
    for char in text:
        cat = unicodedata.category(char)
        if cat.startswith("C"):
            continue
        output.append(char)
    return "".join(output)
           e Zd ZdZeZddgZ	 	 	 	 	 	 	 	 	 d' fd
	Zdee	e
f         fdZede
fd            Zd Zd Zd Zd Zed             Zed             Zd Zd Zd Zd(dZd Zd Zd Z	 d)dee
         deee
                  dee
         fdZ	 d*dee
         deee
                  dedee
         f fd Z	 d)dee
         deee
                  dee
         fd!Z d)d"e	d#ee	         de!e	         fd$Z"d% Z#d& Z$ xZ%S )+FSMTTokenizera	  
    Construct a FAIRSEQ Transformer tokenizer. Based on Byte-Pair Encoding. The tokenization process is the following:

    - Moses preprocessing and tokenization.
    - Normalizing all input text.
    - The arguments `special_tokens` and the function `set_special_tokens` can be used to add additional symbols (like
      "__classify__") to a vocabulary.
    - The argument `langs` defines a pair of languages.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        langs (`List[str]`, *optional*):
            A list of two languages to translate from and to, for instance `["en", "ru"]`.
        src_vocab_file (`str`, *optional*):
            File containing the vocabulary for the source language.
        tgt_vocab_file (`str`, *optional*):
            File containing the vocabulary for the target language.
        merges_file (`str`, *optional*):
            File containing the merges.
        do_lower_case (`bool`, *optional*, defaults to `False`):
            Whether or not to lowercase the input when tokenizing.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        langs=None,
        src_vocab_file=None,
        tgt_vocab_file=None,
        merges_file=None,
        do_lower_case=False,
        unk_token="<unk>",
        bos_token="<s>",
        sep_token="</s>",
        pad_token="<pad>",
        **kwargs,
    ):
        try:
            import sacremoses
        except ImportError:
            raise ImportError(
                "You need to install sacremoses to use FSMTTokenizer. See"
                " https://pypi.org/project/sacremoses/ for installation."
            )

        self.sm = sacremoses

        self.src_vocab_file = src_vocab_file
        self.tgt_vocab_file = tgt_vocab_file
        self.merges_file = merges_file
        self.do_lower_case = do_lower_case

        # caches of sacremoses normalizer/tokenizer/detokenizer instances, keyed by language
        self.cache_moses_punct_normalizer = {}
        self.cache_moses_tokenizer = {}
        self.cache_moses_detokenizer = {}

        if langs and len(langs) == 2:
            self.src_lang, self.tgt_lang = langs
        else:
            raise ValueError(
                f"arg `langs` needs to be a list of 2 langs, e.g. ['en', 'ru'], but got {langs}."
                " Usually that means that the tokenizer can't find a mapping for the given model path"
                " in the vocab file maps of this tokenizer."
            )

        with open(src_vocab_file, encoding="utf-8") as src_vocab_handle:
            self.encoder = json.load(src_vocab_handle)
        with open(tgt_vocab_file, encoding="utf-8") as tgt_vocab_handle:
            tgt_vocab = json.load(tgt_vocab_handle)
            self.decoder = {v: k for k, v in tgt_vocab.items()}
        with open(merges_file, encoding="utf-8") as merges_handle:
            merges = merges_handle.read().split("\n")[:-1]
        merges = [tuple(merge.split()[:2]) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}

        super().__init__(
            langs=langs,
            src_vocab_file=src_vocab_file,
            tgt_vocab_file=tgt_vocab_file,
            merges_file=merges_file,
            do_lower_case=do_lower_case,
            unk_token=unk_token,
            bos_token=bos_token,
            sep_token=sep_token,
            pad_token=pad_token,
            **kwargs,
        )

    def get_vocab(self) -> Dict[str, int]:
        return self.get_src_vocab()

    @property
    def vocab_size(self) -> int:
        return self.src_vocab_size

    def moses_punct_norm(self, text, lang):
        if lang not in self.cache_moses_punct_normalizer:
            punct_normalizer = self.sm.MosesPunctNormalizer(lang=lang)
            self.cache_moses_punct_normalizer[lang] = punct_normalizer
        return self.cache_moses_punct_normalizer[lang].normalize(text)

    def moses_tokenize(self, text, lang):
        if lang not in self.cache_moses_tokenizer:
            moses_tokenizer = self.sm.MosesTokenizer(lang=lang)
            self.cache_moses_tokenizer[lang] = moses_tokenizer
        return self.cache_moses_tokenizer[lang].tokenize(
            text, aggressive_dash_splits=True, return_str=False, escape=True
        )

    def moses_detokenize(self, tokens, lang):
        if lang not in self.cache_moses_detokenizer:
            moses_detokenizer = self.sm.MosesDetokenizer(lang=lang)
            self.cache_moses_detokenizer[lang] = moses_detokenizer
        return self.cache_moses_detokenizer[lang].detokenize(tokens)

    def moses_pipeline(self, text, lang):
        text = replace_unicode_punct(text)
        text = self.moses_punct_norm(text, lang)
        text = remove_non_printing_char(text)
        return text

    @property
    def src_vocab_size(self) -> int:
        return len(self.encoder)

    @property
    def tgt_vocab_size(self) -> int:
        return len(self.decoder)

    def get_src_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def get_tgt_vocab(self):
        return dict(self.decoder, **self.added_tokens_decoder)

    def bpe(self, token):
        # the last symbol of the word carries the "</w>" end-of-word marker
        word = tuple(token[:-1]) + (token[-1] + "</w>",)
        if token in self.cache:
            return self.cache[token]
        pairs = get_pairs(word)

        if not pairs:
            return token + "</w>"

        while True:
            # greedily merge the lowest-ranked (i.e. earliest learned) pair
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        # special case: keep an escaped newline as a single token
        if word == "\n  </w>":
            word = "\n</w>"
        self.cache[token] = word
        return word
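
    # E.g. (hypothetical merges table): with bpe_ranks = {("l", "o"): 0},
    # bpe("low") starts from ("l", "o", "w</w>"), merges the top-ranked pair,
    # and returns "lo w</w>", since ("lo", "w</w>") has no rank.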

    def _tokenize(self, text, lang="en", bypass_tokenizer=False):
        """
        Tokenize a string given language code using Moses.

        Details of tokenization:

            - [sacremoses](https://github.com/alvations/sacremoses): port of Moses
            - Install with `pip install sacremoses`

        Args:
            - lang: ISO language code (default = 'en') (string). Languages should belong to the model's supported
              languages. However, we don't enforce it.
            - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False)
              (bool). If True, we only apply BPE.

        Returns:
            List of tokens.
        """
        # ignore the `lang` argument: FSMT models are single language-pair models,
        # so the source language is fixed at construction time
        lang = self.src_lang

        if self.do_lower_case:
            text = text.lower()

        if bypass_tokenizer:
            text = text.split()
        else:
            text = self.moses_pipeline(text, lang=lang)
            text = self.moses_tokenize(text, lang=lang)

        split_tokens = []
        for token in text:
            if token:
                split_tokens.extend(list(self.bpe(token).split(" ")))

        return split_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) into an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) into a token (str) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) into a single string."""
        # remove BPE end-of-word markers and re-split into words
        tokens = [t.replace(" ", "").replace("</w>", " ") for t in tokens]
        tokens = "".join(tokens).split()
        # detokenize with Moses using the target language
        text = self.moses_detokenize(tokens, self.tgt_lang)
        return text
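
    # Round-trip sketch (illustrative pieces; the actual pieces depend on the
    # loaded merges file):
    #
    #     pieces = tokenizer._tokenize("Hello!")      # e.g. ["Hell", "o</w>", "!</w>"]
    #     tokenizer.convert_tokens_to_string(pieces)  # -> "Hello!"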

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A FAIRSEQ Transformer sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        sep = [self.sep_token_id]
        if token_ids_1 is None:
            return token_ids_0 + sep
        return token_ids_0 + sep + token_ids_1 + sep
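
    # E.g. (hypothetical ids, assuming `sep_token_id` is 2):
    #
    #     tokenizer.build_inputs_with_special_tokens([5, 6])          # -> [5, 6, 2]
    #     tokenizer.build_inputs_with_special_tokens([5, 6], [7, 8])  # -> [5, 6, 2, 7, 8, 2]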

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )
        if token_ids_1 is not None:
            return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A FAIRSEQ
        Transformer sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        if token_ids_1 is None:
            return len(token_ids_0 + sep) * [0]
        return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        src_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["src_vocab_file"]
        )
        tgt_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["tgt_vocab_file"]
        )
        merges_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        with open(src_vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        with open(tgt_vocab_file, "w", encoding="utf-8") as f:
            tgt_vocab = {v: k for k, v in self.decoder.items()}
            f.write(json.dumps(tgt_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        with open(merges_file, "w", encoding="utf-8") as writer:
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {merges_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return src_vocab_file, tgt_vocab_file, merges_file

    def __getstate__(self):
        state = self.__dict__.copy()
        # the sacremoses module is not picklable; drop it and re-import on unpickle
        state["sm"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        try:
            import sacremoses
        except ImportError:
            raise ImportError(
                "You need to install sacremoses to use FSMTTokenizer. See"
                " https://pypi.org/project/sacremoses/ for installation."
            )

        self.sm = sacremoses
__module____qualname____doc__r   vocab_files_namesmodel_input_namesr|   r   strintr   propertyr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   boolr   r   r   r   r   r  __classcell__)r   s   @r   rD   rD   r   s       + +Z *$&67 @
 @
 @
 @
 @
 @
F$4S> $ $ $ $ #C # # # X#G G G
 
 
E E E   ! ! X! ! ! X!? ? ?? ? ?* * *X% % % %NI I I7 7 7   JN5 5953;DI3F5	c5 5 5 56 sx. .9.3;DI3F.ko.	c. . . . . .: JNK K9K3;DI3FK	cK K K KB"; ";c ";HSM ";]bcf]g "; "; "; ";H  
      r   rD   )r  rp   r   r4   r;   typingr   r   r   r   tokenization_utilsr   utilsr	   
get_loggerr  r   r   r   r7   rB   rD   rR   r   r   <module>r     s   % $  				 				     . . . . . . . . . . . . 5 5 5 5 5 5       
	H	%	% '&  
 
 
( ( (V
 
 
,T T T T T' T T T T Tr   