
"""Tokenization class for model MyT5."""

import json
import os
import warnings
from collections import defaultdict
from typing import Dict, List, Optional, Tuple, Union

from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "byte_maps.json"}


class ByteRewriter:
    """
    Byte rewriter class for MyT5 tokenizer.
    This class is used to rewrite bytes using a hash tree. The hash tree is constructed from a set of rewriting rules.

    Args:
        rewriting_rules (`str` or `Dict[str, str]`):
            A path to a json file containing the rewriting rules or a dictionary containing the rewriting rules.

    z[LEAF]rewriting_rulesc                    t          |t                    r=t          |d          5 }t          j        |          }d d d            n# 1 swxY w Y   n4t          |t
                    st          dt          |                     |                     |          | _	        d |
                                D             }|                     |          | _        d S )NrzDrewriting_rules should be either a path to json file or a dict, got c                     i | ]\  }}||	S  r   ).0kvs      f/var/www/html/ai-engine/env/lib/python3.11/site-packages/transformers/models/myt5/tokenization_myt5.py
<dictcomp>z)ByteRewriter.__init__.<locals>.<dictcomp>8   s    "L"L"LDAq1a"L"L"L    )
isinstancestropenjsonloaddict
ValueErrortypeconstruct_hash_tree	hash_treeitemsreverse_hash_tree)selfr   freverse_rewriting_ruless       r   __init__zByteRewriter.__init__.   s   os++ 	os++ /q"&)A,,/ / / / / / / / / / / / / / /OT22 	nW[\kWlWlnn   11/BB"L"LO4I4I4K4K"L"L"L!%!9!9:Q!R!Rs   AAAr$   byte_in_sequencebyte_out_sequencec                     |                     d          }|                     d          }|}|D ]}||vri ||<   ||         }||| j        <   dS )zL
        Add a leaf with the output byte sequence to the hash tree.
         N)splitLEAF)r'   r$   r+   r,   byte_in_listbyte_out_listtree_pointerbs           r   add_leafzByteRewriter.add_leaf;   so     (--c22)//44  	+ 	+A$$"$Q'?LL"/TYr   returnc                     t          t                    }d t          d          D             D ]}|g||         | j        <   |                                D ]\  }}|                     |||           |S )zE
        Construct a hash tree for rewritten byte sequences.
        c              3      K   | ]}|d V  	dS )02xNr   )r   xs     r   	<genexpr>z3ByteRewriter.construct_hash_tree.<locals>.<genexpr>O   s&      11Q**111111r      )r   r    ranger0   r%   r5   )r'   r   r$   r4   in_sequenceout_sequences         r   r#   z ByteRewriter.construct_hash_treeJ   s      %%	11eCjj111 	* 	*A'(cIaL##)8)>)>)@)@ 	@ 	@%KMM)[,????r   byte_sequenceNc                 R    | j         }|D ]}||v r	||         } dS || j                 S )zW
        Search the hash tree and return the rewritten byte sequence if found.
        N)r$   r0   )r'   r@   r3   r4   s       r   search_hash_treezByteRewriter.search_hash_treeW   sD     ~ 	 	AL  +AttDI&&r   Fin_bytesc                 j   g }d}d}|t          |          k     r|s| j        n| j        }t          |t          |                    D ]>}||         }||v r	||         }n||k    r|g}	|} n n| j        |v r|| j                 }	|}?|                    |	           |dz   }|t          |          k     |S )a6  
        Rewrite a sequence of bytes using the hash tree.

        Args:
            in_bytes (`List[str]`): A list of bytes to be rewritten.
            reverse (`bool`): If True, decoding is performed with the reverse hash tree.
        Returns:
            `List[str]`: The rewritten byte sequence.
        """
        out_bytes = []
        b_start = 0
        b_end = 0

        while b_start < len(in_bytes):
            # Walk the tree from b_start, remembering the deepest leaf seen so far
            # (the longest matching rule); a byte with no entry falls back to itself.
            tree_pointer = self.hash_tree if not reverse else self.reverse_hash_tree
            for j in range(b_start, len(in_bytes)):
                b = in_bytes[j]
                if b in tree_pointer:
                    tree_pointer = tree_pointer[b]
                elif j == b_start:
                    cur_leaf = [b]
                    b_end = j
                    break
                else:
                    break
                if self.LEAF in tree_pointer:
                    cur_leaf = tree_pointer[self.LEAF]
                    b_end = j
            out_bytes.extend(cur_leaf)
            b_start = b_end + 1

        return out_bytes
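
# Illustration of how `ByteRewriter` applies its rules. The rule below is
# hypothetical -- real rules ship in `byte_maps.json` under the "decompose_map"
# and "merge_map" keys. Matching is greedy: the longest rule starting at the
# current position wins, and unmatched bytes pass through unchanged.
#
#     rewriter = ByteRewriter({"41 42": "ff"})            # hypothetical rule: bytes "AB" -> 0xff
#     rewriter.rewrite_bytes(["41", "42", "43"])          # -> ["ff", "43"]
#     rewriter.rewrite_bytes(["ff", "43"], reverse=True)  # -> ["41", "42", "43"]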
__module____qualname____doc__r0   r   r   r   r*   r    r   r5   r#   rB   rN   r   r   r   r   r   !   sB         DSc4S>.A(B S S S S0$sE$S	/,B'B"C 0WZ 0or 0 0 0 04S> d3PUVZ\`ad\eVePfKfFg    'd3i 'E$S	/<R ' ' ' '   d3i  49            r   r   c            
           e Zd ZdZddgZeZ	 	 	 	 	 d!	 d" fd
Zed             Z	d Z
	 d#dee         deee                  ded	ee         f fdZdee         d	ee         fdZ	 d$dee         deee                  d	ee         fdZ	 d$dee         deee                  d	ee         fdZded	ee         fdZd Zd Zdee         d	ee         fdZdee         d	ee         fdZd Zd$dedee         d	ee         fd Z xZS )%MyT5Tokenizera  
    Construct a MyT5 tokenizer.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`): The file containing the byte rewriting rules.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (`int`, *optional*, defaults to 125):
            Add a number of extra ids added to the end of the vocabulary for use as sentinels. These tokens are
            accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are
            indexed from the end of the vocabulary up to beginning ("<extra_id_0>" is the last token in the vocabulary
            like in ByT5 preprocessing see
            [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)).
        additional_special_tokens (`List[str]`, *optional*):
            Additional special tokens used by the tokenizer.
    """

    model_input_names = ["input_ids", "attention_mask"]
    vocab_files_names = VOCAB_FILES_NAMES

    def __init__(
        self,
        vocab_file,
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        extra_ids=125,
        additional_special_tokens=None,
        **kwargs,
    ) -> None:
        # Add extra_ids to the special token list
        if extra_ids > 0 and additional_special_tokens is None:
            additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
        elif extra_ids > 0 and additional_special_tokens is not None and len(additional_special_tokens) > 0:
            # Check that we have the right number of extra_id special tokens
            extra_tokens = len(set(filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens)))
            if extra_tokens != extra_ids:
                raise ValueError(
                    f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
                    " provided to MyT5Tokenizer. In this case the additional_special_tokens must include the"
                    " extra_ids tokens"
                )

        pad_token = AddedToken(pad_token, lstrip=True, rstrip=True) if isinstance(pad_token, str) else pad_token
        eos_token = AddedToken(eos_token, lstrip=True, rstrip=True) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, lstrip=True, rstrip=True) if isinstance(unk_token, str) else unk_token

        # These three tokens occupy ids 0-2; byte tokens are shifted by this offset.
        self._added_tokens_decoder = {0: pad_token, 1: eos_token, 2: unk_token}
        self.offset = len(self._added_tokens_decoder)
        self._utf_vocab_size = 2**8  # utf is 8 bits

        # Load the morphological byte rewriting maps.
        with open(vocab_file, "r") as f:
            self.byte_maps = json.load(f)

        self.decompose_rewriter = ByteRewriter(self.byte_maps["decompose_map"])
        self.merge_rewriter = ByteRewriter(self.byte_maps["merge_map"])

        super().__init__(
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            extra_ids=0,  # the extra tokens are already in additional_special_tokens
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )

    @property
    def vocab_size(self):
        return self._utf_vocab_size

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size + self.offset)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # normal case: some special tokens
        if token_ids_1 is None:
            return ([0] * len(token_ids_0)) + [1]
        return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]

    def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]:
        """Do not add eos again if user already added it."""
        if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
            warnings.warn(
                f"This sequence already has {self.eos_token}. In future versions this behavior may lead to"
                " duplicated eos tokens being added."
            )
            return token_ids
        else:
            return token_ids + [self.eos_token_id]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. MyT5 does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        eos = [self.eos_token_id]

        if token_ids_1 is None:
            return len(token_ids_0 + eos) * [0]
        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        token_ids_0 = self._add_eos_if_not_present(token_ids_0)
        if token_ids_1 is None:
            return token_ids_0
        else:
            token_ids_1 = self._add_eos_if_not_present(token_ids_1)
            return token_ids_0 + token_ids_1

    def _tokenize(self, text: str, **kwargs) -> List[str]:
        """Take as input a string and return a list of strings (tokens) for words/sub-words.
        Represents tokens in two character hex format"""

        tokens = [f"{i:02x}" for i in text.encode("utf-8")]
        tokens = self.morphological_encode(tokens)
        return tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""

        if len(token) != 2:
            token_id = None
        else:
            token_id = int(token, 16) + self.offset

        return token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = f"{index - self.offset:02x}"
        return token

    def morphological_encode(self, indices: List[str]) -> List[str]:
        # Decompose and merge morphological sequences
        indices = self.decompose_rewriter.rewrite_bytes(indices, reverse=False)
        indices = self.merge_rewriter.rewrite_bytes(indices, reverse=False)
        return indices

    def morphological_decode(self, indices: List[str]) -> List[str]:
        # Demerge and recompose morphological sequences (inverse order of encoding)
        indices = self.merge_rewriter.rewrite_bytes(indices, reverse=True)
        indices = self.decompose_rewriter.rewrite_bytes(indices, reverse=True)
        return indices

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        bstring = b""

        out_tokens = []
        for token in tokens:
            if token in self.added_tokens_decoder:
                out_tokens.append(self.added_tokens_decoder[token])
            elif token in self.added_tokens_encoder:
                out_tokens.append(token)
            else:
                out_tokens.append(token)

        out_tokens = self.morphological_decode(out_tokens)
        _added_tokens = set(self.added_tokens_decoder.values()) | set(self.added_tokens_encoder)
        for token in out_tokens:
            if token in _added_tokens:
                bstring += bytes(token, "utf-8")
            else:
                bstring += bytes.fromhex(token)
        string = bstring.decode("utf-8", errors="ignore")
        return string
N- r   wr   )encodingrg   F)indentensure_ascii)
ospathisdirjoinVOCAB_FILES_NAMESr   writer   dumpsrt   )r'   r   r   r   writers        r   save_vocabularyzMyT5Tokenizer.save_vocabularyp  s   7==(( 	]/!Q3!6!6rUfgsUt t JJ 4CJ/C//n\J*cG444 	SLLDN15QQQRRR	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S 	S}s   40B00B47B4)rW   rX   rY   rZ   N)r6   N)NFr|   )rO   rP   rQ   rR   model_input_namesr   vocab_files_namesr*   propertyr}   r   r   r   r   rc   r   r   r   r   r   r   r   r   r   r   r   r   r   __classcell__)rz   s   @r   rT   rT      s        4 %&67)
 "&,
 
,
 ,
 ,
 ,
 ,
 ,
\ $ $ X$   sxO O9O3;DI3FOkoO	cO O O O O O8	3c 	3tCy 	3 	3 	3 	3 JN@ @9@3;DI3F@	c@ @ @ @0 JN- -9-3;DI3F-	c- - - -4c S	        
DI $s)    DI $s)      .	 	c 	HSM 	]bcf]g 	 	 	 	 	 	 	 	r   rT   )rR   r   r   r   collectionsr   typingr   r   r   r   r   tokenization_utilsr
   r   utilsr   
get_loggerrO   loggerr   r   rT   r   r   r   <module>r      s    ) (  				  # # # # # # 5 5 5 5 5 5 5 5 5 5 5 5 5 5 A A A A A A A A       
	H	%	% "#34 c c c c c c c cLr r r r r' r r r r rr   