
"""Tokenization classes for OpenAI GPT."""

import json
import os
import re
import unicodedata
from typing import Optional, Tuple

from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens


class BasicTokenizer:
    """
    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).

    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
        do_split_on_punc (`bool`, *optional*, defaults to `True`):
            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
            the full context of the words, such as contractions.
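
    Example (an illustrative addition, not part of the upstream docstring; the output assumes the
    default arguments described above):

    >>> BasicTokenizer(do_lower_case=True).tokenize("Héllo, World!")
    ['hello', ',', 'world', '!']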
    """

    def __init__(
        self,
        do_lower_case=True,
        never_split=None,
        tokenize_chinese_chars=True,
        strip_accents=None,
        do_split_on_punc=True,
    ):
        if never_split is None:
            never_split = []
        self.do_lower_case = do_lower_case
        self.never_split = set(never_split)
        self.tokenize_chinese_chars = tokenize_chinese_chars
        self.strip_accents = strip_accents
        self.do_split_on_punc = do_split_on_punc

    def tokenize(self, text, never_split=None):
        """
        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.

        Args:
            never_split (`List[str]`, *optional*)
                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
        """
        # union() returns a new set by concatenating the two sets.
        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
        text = self._clean_text(text)

        if self.tokenize_chinese_chars:
            text = self._tokenize_chinese_chars(text)
        # prevents treating the same character with different unicode codepoints as different characters
        unicode_normalized_text = unicodedata.normalize("NFC", text)
        orig_tokens = whitespace_tokenize(unicode_normalized_text)
        split_tokens = []
        for token in orig_tokens:
            if token not in never_split:
                if self.do_lower_case:
                    token = token.lower()
                    if self.strip_accents is not False:
                        token = self._run_strip_accents(token)
                elif self.strip_accents:
                    token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token, never_split))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text."""
        if not self.do_split_on_punc or (never_split is not None and text in never_split):
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        if (
            (cp >= 0x4E00 and cp <= 0x9FFF)
            or (cp >= 0x3400 and cp <= 0x4DBF)
            or (cp >= 0x20000 and cp <= 0x2A6DF)
            or (cp >= 0x2A700 and cp <= 0x2B73F)
            or (cp >= 0x2B740 and cp <= 0x2B81F)
            or (cp >= 0x2B820 and cp <= 0x2CEAF)
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)
        ):
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xFFFD or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)


def get_pairs(word):
    """
    Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
    strings)
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


def text_standardize(text):
    """
    fixes some issues the spacy tokenizer had on books corpus also does some whitespace standardization
    """
    text = text.replace("—", "-")
    text = text.replace("–", "-")
    text = text.replace("―", "-")
    text = text.replace("…", "...")
    text = text.replace("´", "'")
    text = re.sub(r"""(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)""", r" \1 ", text)
    text = re.sub(r"\s*\n\s*", " \n ", text)
    text = re.sub(r"[^\S\n]+", " ", text)
    return text.strip()


class OpenAIGPTTokenizer(PreTrainedTokenizer):
    """
    Construct a GPT Tokenizer. Based on Byte-Pair-Encoding with the following peculiarities:

    - lowercases all inputs,
    - uses `SpaCy` tokenizer and `ftfy` for pre-BPE tokenization if they are installed, fallback to BERT's
      `BasicTokenizer` if not.
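
    Whether the SpaCy/ftfy path or the fallback is in use can be checked on an instance (an
    illustrative note, not part of the upstream docstring): `tokenizer.fix_text` is `None` exactly
    when the BERT-style `BasicTokenizer` fallback is active.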

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
        try:
            import ftfy
            from spacy.lang.en import English

            _nlp = English()
            self.nlp = _nlp.tokenizer
            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
            self.nlp = BasicTokenizer(do_lower_case=True)
            self.fix_text = None

        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        with open(merges_file, encoding="utf-8") as merges_handle:
            # skip the "#version" header line and the trailing empty line
            merges = merges_handle.read().split("\n")[1:-1]
        merges = [tuple(merge.split()) for merge in merges]
        # pair -> merge priority: lower rank means the pair is merged earlier
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}

        super().__init__(unk_token=unk_token, **kwargs)

    @property
    def do_lower_case(self):
        return True

    @property
    def vocab_size(self):
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def bpe(self, token):
        word = tuple(token[:-1]) + (token[-1] + "</w>",)
        if token in self.cache:
            return self.cache[token]
        pairs = get_pairs(word)

        if not pairs:
            return token + "</w>"

        while True:
            # greedily merge the adjacent pair with the lowest merge rank
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        if word == "\n  </w>":
            word = "\n</w>"
        self.cache[token] = word
        return word

    def _tokenize(self, text):
        """Tokenize a string."""
        split_tokens = []
        if self.fix_text is None:
            # Using BERT's BasicTokenizer
            text = self.nlp.tokenize(text)
            for token in text:
                split_tokens.extend(list(self.bpe(token).split(" ")))
        else:
            # Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
            text = self.nlp(text_standardize(self.fix_text(text)))
            for token in text:
                split_tokens.extend(list(self.bpe(token.text.lower()).split(" ")))
        return split_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an id in a token (BPE) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        out_string = "".join(tokens).replace("</w>", " ").strip()
        return out_string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        with open(merge_file, "w", encoding="utf-8") as writer:
            writer.write("#version: 0.2\n")
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return vocab_file, merge_file