
import json
import os
import re
import warnings
from pathlib import Path
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple, Union

import sentencepiece

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "source_spm": "source.spm",
    "target_spm": "target.spm",
    "vocab": "vocab.json",
    "target_vocab_file": "target_vocab.json",
    "tokenizer_config_file": "tokenizer_config.json",
}

SPIECE_UNDERLINE = "▁"


class MarianTokenizer(PreTrainedTokenizer):
    r"""
    Construct a Marian tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        source_spm (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
            contains the vocabulary for the source language.
        target_spm (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
            contains the vocabulary for the target language.
        source_lang (`str`, *optional*):
            A string representing the source language.
        target_lang (`str`, *optional*):
            A string representing the target language.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        model_max_length (`int`, *optional*, defaults to 512):
            The maximum sentence length the model accepts.
        additional_special_tokens (`List[str]`, *optional*, defaults to `["<eop>", "<eod>"]`):
            Additional special tokens used by the tokenizer.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
                using the forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

    Examples:

    ```python
    >>> from transformers import MarianForCausalLM, MarianTokenizer

    >>> model = MarianForCausalLM.from_pretrained("Helsinki-NLP/opus-mt-en-de")
    >>> tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
    >>> src_texts = ["I am a small frog.", "Tom asked his teacher for advice."]
    >>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."]  # optional
    >>> inputs = tokenizer(src_texts, text_target=tgt_texts, return_tensors="pt", padding=True)

    >>> outputs = model(**inputs)  # should work
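
    >>> # Illustrative only: extra keyword arguments such as `sp_model_kwargs` are forwarded to the
    >>> # tokenizer's constructor, e.g. to turn on SentencePiece subword regularization (arbitrary values).
    >>> sampling_tokenizer = MarianTokenizer.from_pretrained(
    ...     "Helsinki-NLP/opus-mt-en-de",
    ...     sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1},
    ... )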
    ```"""

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    language_code_re = re.compile(">>.+<<")

    def __init__(
        self,
        source_spm,
        target_spm,
        vocab,
        target_vocab_file=None,
        source_lang=None,
        target_lang=None,
        unk_token="<unk>",
        eos_token="</s>",
        pad_token="<pad>",
        model_max_length=512,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        separate_vocabs=False,
        **kwargs,
    ) -> None:
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        assert Path(source_spm).exists(), f"cannot find spm source {source_spm}"

        self.separate_vocabs = separate_vocabs
        self.encoder = load_json(vocab)
        if str(unk_token) not in self.encoder:
            raise KeyError("<unk> token must be in the vocab")
        assert str(pad_token) in self.encoder

        if separate_vocabs:
            self.target_encoder = load_json(target_vocab_file)
            self.decoder = {v: k for k, v in self.target_encoder.items()}
            self.supported_language_codes = []
        else:
            self.decoder = {v: k for k, v in self.encoder.items()}
            self.supported_language_codes = [k for k in self.encoder if k.startswith(">>") and k.endswith("<<")]

        self.source_lang = source_lang
        self.target_lang = target_lang
        self.spm_files = [source_spm, target_spm]

        # load SentencePiece models for pre-processing
        self.spm_source = load_spm(source_spm, self.sp_model_kwargs)
        self.spm_target = load_spm(target_spm, self.sp_model_kwargs)
        self.current_spm = self.spm_source
        self.current_encoder = self.encoder

        self._setup_normalizer()

        super().__init__(
            source_lang=source_lang,
            target_lang=target_lang,
            unk_token=unk_token,
            eos_token=eos_token,
            pad_token=pad_token,
            model_max_length=model_max_length,
            sp_model_kwargs=self.sp_model_kwargs,
            target_vocab_file=target_vocab_file,
            separate_vocabs=separate_vocabs,
            **kwargs,
        )

    def _setup_normalizer(self):
        try:
            from sacremoses import MosesPunctNormalizer

            self.punc_normalizer = MosesPunctNormalizer(self.source_lang).normalize
        except (ImportError, FileNotFoundError):
            warnings.warn("Recommended: pip install sacremoses.")
            self.punc_normalizer = lambda x: x

    def normalize(self, x: str) -> str:
        """Cover moses empty string edge case. They return empty list for '' input!"""
        return self.punc_normalizer(x) if x else ""

    def _convert_token_to_id(self, token):
        return self.current_encoder.get(token, self.current_encoder[self.unk_token])

    def remove_language_code(self, text: str):
        """Remove language codes like >>fr<< before sentencepiece"""
        match = self.language_code_re.match(text)
        code = [match.group(0)] if match else []
        return code, self.language_code_re.sub("", text)

    def _tokenize(self, text: str) -> List[str]:
        code, text = self.remove_language_code(text)
        pieces = self.current_spm.encode(text, out_type=str)
        return code + pieces

    def _convert_id_to_token(self, index: int) -> str:
        """Converts an index (integer) in a token (str) using the decoder."""
        return self.decoder.get(index, self.unk_token)

    def batch_decode(self, sequences, **kwargs):
        """
        Convert a list of lists of token ids into a list of strings by calling decode.

        Args:
            sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
            use_source_tokenizer (`bool`, *optional*, defaults to `False`):
                Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence
                problems).
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `List[str]`: The list of decoded sentences.
        """
        return super().batch_decode(sequences, **kwargs)

    def decode(self, token_ids, **kwargs):
        """
        Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
        tokens and clean up tokenization spaces.

        Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.

        Args:
            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
            use_source_tokenizer (`bool`, *optional*, defaults to `False`):
                Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence
                problems).
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `str`: The decoded sentence.
        """
        return super().decode(token_ids, **kwargs)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Uses source spm if _decode_use_source_tokenizer is True, and target spm otherwise"""
        sp_model = self.spm_source if self._decode_use_source_tokenizer else self.spm_target
        current_sub_tokens = []
        out_string = ""
        for token in tokens:
            # make sure that special tokens are not decoded using the sentencepiece model
            if token in self.all_special_tokens:
                out_string += sp_model.decode_pieces(current_sub_tokens) + token + " "
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
        out_string += sp_model.decode_pieces(current_sub_tokens)
        out_string = out_string.replace(SPIECE_UNDERLINE, " ")
        return out_string.strip()

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
        """Build model inputs from a sequence by appending eos_token_id."""
        if token_ids_1 is None:
            return token_ids_0 + [self.eos_token_id]
        # We don't expect to process pairs, but leave the pair logic for API consistency
        return token_ids_0 + token_ids_1 + [self.eos_token_id]

    def _switch_to_input_mode(self):
        self.current_spm = self.spm_source
        self.current_encoder = self.encoder

    def _switch_to_target_mode(self):
        self.current_spm = self.spm_target
        if self.separate_vocabs:
            self.current_encoder = self.target_encoder

    @property
    def vocab_size(self) -> int:
        return len(self.encoder)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        saved_files = []

        if self.separate_vocabs:
            out_src_vocab_file = os.path.join(
                save_directory,
                (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab"],
            )
            out_tgt_vocab_file = os.path.join(
                save_directory,
                (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["target_vocab_file"],
            )
            save_json(self.encoder, out_src_vocab_file)
            save_json(self.target_encoder, out_tgt_vocab_file)
            saved_files.append(out_src_vocab_file)
            saved_files.append(out_tgt_vocab_file)
        else:
            out_vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab"]
            )
            save_json(self.encoder, out_vocab_file)
            saved_files.append(out_vocab_file)

        for spm_save_filename, spm_orig_path, spm_model in zip(
            [VOCAB_FILES_NAMES["source_spm"], VOCAB_FILES_NAMES["target_spm"]],
            self.spm_files,
            [self.spm_source, self.spm_target],
        ):
            spm_save_path = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + spm_save_filename
            )
            if os.path.abspath(spm_orig_path) != os.path.abspath(spm_save_path) and os.path.isfile(spm_orig_path):
                copyfile(spm_orig_path, spm_save_path)
                saved_files.append(spm_save_path)
            elif not os.path.isfile(spm_orig_path):
                with open(spm_save_path, "wb") as fi:
                    content_spiece_model = spm_model.serialized_model_proto()
                    fi.write(content_spiece_model)
                saved_files.append(spm_save_path)

        return tuple(saved_files)

    def get_vocab(self) -> Dict:
        return self.get_src_vocab()

    def get_src_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def get_tgt_vocab(self):
        return dict(self.target_encoder, **self.added_tokens_decoder)

    def __getstate__(self) -> Dict:
        state = self.__dict__.copy()
        # the SentencePiece models and the moses normalizer are not picklable
        state.update(
            {k: None for k in ["spm_source", "spm_target", "current_spm", "punc_normalizer", "target_vocab_file"]}
        )
        return state

    def __setstate__(self, d: Dict) -> None:
        self.__dict__ = d

        # for backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.spm_source, self.spm_target = (load_spm(f, self.sp_model_kwargs) for f in self.spm_files)
        self.current_spm = self.spm_source
        self._setup_normalizer()

    def num_special_tokens_to_add(self, *args, **kwargs):
        """Just EOS"""
        return 1

    def _special_token_mask(self, seq):
        all_special_ids = set(self.all_special_ids)  # call it once instead of inside list comp
        all_special_ids.remove(self.unk_token_id)  # <unk> is only sometimes special
        return [1 if x in all_special_ids else 0 for x in seq]

    def get_special_tokens_mask(
        self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """Get list where entries are [1] if a token is [eos] or [pad] else 0."""
        if already_has_special_tokens:
            return self._special_token_mask(token_ids_0)
        elif token_ids_1 is None:
            return self._special_token_mask(token_ids_0) + [1]
        else:
            return self._special_token_mask(token_ids_0 + token_ids_1) + [1]


def load_spm(path: str, sp_model_kwargs: Dict[str, Any]) -> sentencepiece.SentencePieceProcessor:
    spm = sentencepiece.SentencePieceProcessor(**sp_model_kwargs)
    spm.Load(path)
    return spm


def save_json(data, path: str) -> None:
    with open(path, "w") as f:
        json.dump(data, f, indent=2)


def load_json(path: str) -> Union[Dict, List]:
    with open(path, "r") as f:
        return json.load(f)