
    Ngu                     Z    d Z ddlZddlZddlmZ ddlmZ ddlmZ  G d de          Z	dS )a  
This is a NLTK port of the tokenizer used in the NIST BLEU evaluation script,
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L926
which was also ported into Python in
https://github.com/lium-lst/nmtpy/blob/master/nmtpy/metrics/mtevalbleu.py#L162
    N)perluniprops)
TokenizerI)xml_unescapec            	       x   e Zd ZdZ ej        d          dfZ ej        d          dfZ ej        d          dfZ ej        d          d	fZ	 ej        d
          dfZ
 ej        d          d	fZee	e
egZ ed                     e ej        d                                        Z ed                     e ej        d                                        Z ed                     e ej        d                                        Z ej        dde          Z ej        dde          Z ej        dde          Z ej        d          dfZ ej        de de d          d	fZ ej        de de d          dfZ ej        de d          dfZeeeegZd ZddZ	 ddZ dS )NISTTokenizeruT  
    This NIST tokenizer is sentence-based instead of the original
    paragraph-based tokenization from mteval-14.pl; The sentence-based
    tokenization is consistent with the other tokenizers available in NLTK.

    >>> from nltk.tokenize.nist import NISTTokenizer
    >>> nist = NISTTokenizer()
    >>> s = "Good muffins cost $3.88 in New York."
    >>> expected_lower = [u'good', u'muffins', u'cost', u'$', u'3.88', u'in', u'new', u'york', u'.']
    >>> expected_cased = [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York', u'.']
    >>> nist.tokenize(s, lowercase=False) == expected_cased
    True
    >>> nist.tokenize(s, lowercase=True) == expected_lower  # Lowercased.
    True

    The international_tokenize() is the preferred function when tokenizing
    non-european text, e.g.

    >>> from nltk.tokenize.nist import NISTTokenizer
    >>> nist = NISTTokenizer()

    # Input strings.
    >>> albb = u'Alibaba Group Holding Limited (Chinese: 阿里巴巴集团控股 有限公司) is a Chinese e-commerce company...'
    >>> amz = u'Amazon.com, Inc. (/ˈæməzɒn/) is an American electronic commerce...'
    >>> rkt = u'Rakuten, Inc. (楽天株式会社 Rakuten Kabushiki-gaisha) is a Japanese electronic commerce and Internet company based in Tokyo.'

    # Expected tokens.
    >>> expected_albb = [u'Alibaba', u'Group', u'Holding', u'Limited', u'(', u'Chinese', u':', u'阿里巴巴集团控股', u'有限公司', u')']
    >>> expected_amz = [u'Amazon', u'.', u'com', u',', u'Inc', u'.', u'(', u'/', u'ˈæ', u'm']
    >>> expected_rkt = [u'Rakuten', u',', u'Inc', u'.', u'(', u'楽天株式会社', u'Rakuten', u'Kabushiki', u'-', u'gaisha']

    >>> nist.international_tokenize(albb)[:10] == expected_albb
    True
    >>> nist.international_tokenize(amz)[:10] == expected_amz
    True
    >>> nist.international_tokenize(rkt)[:10] == expected_rkt
    True

    # Doctest for patching issue #1926
    >>> sent = u'this is a foo☄sentence.'
    >>> expected_sent = [u'this', u'is', u'a', u'foo', u'☄', u'sentence', u'.']
    >>> nist.international_tokenize(sent) == expected_sent
    True
    z	<skipped> u     z([\{-\~\[-\` -\&\(-\+\:-\@\/])z \1 z([^0-9])([\.,])z\1 \2 z([\.,])([^0-9])z \1 \2z
([0-9])(-)NumberPunctuationSymbolz[]^\\-]z\\\g<0>z([ -]+)z([z])([z])c                     | j         \  }}|                    ||          }t          |          }| j        \  }}|                    ||          }|S )z8Performs the language independent string substituitions.)
STRIP_SKIPsubr   STRIP_EOL_HYPHEN)selftextregexpsubstitutions       N/var/www/html/ai-engine/env/lib/python3.11/site-packages/nltk/tokenize/nist.pylang_independent_subz"NISTTokenizer.lang_independent_sub   sU    
  $zz,--D!!#4zz,--    FTc                    t          |          }|                     |          }|rAd|z   dz   }|r|                                }| j        D ]\  }}|                    ||          }d                    |                                          }t          |                                          }|r|n|                                S Nr	   )strr   lowerLANG_DEPENDENT_REGEXESr   joinsplitstrip)r   r   	lowercasewestern_lang
return_strr   r   s          r   tokenizezNISTTokenizer.tokenize   s    4yy((.. 	6:#D $zz||(,(C 6 6$zz,55xx

%% 4::<<  !3tttzz||3r   c                    t          |          }| j        \  }}|                    ||          }| j        \  }}|                    ||          }t	          |          }|r|                                }| j        D ]\  }}|                    ||          }d                    |                                	                                          }|r|n|	                                S r   )
r   r   r   r   r   r   INTERNATIONAL_REGEXESr   r   r   )r   r   r    split_non_asciir"   r   r   s          r   international_tokenizez$NISTTokenizer.international_tokenize   s     4yy  $zz,--#4zz,--D!! 	 ::<<D$($> 	2 	2 FL::lD11DD xx

**,,--!3tttzz||3r   N)FTF)!__name__
__module____qualname____doc__recompiler   r   PUNCTPERIOD_COMMA_PRECEEDPERIOD_COMMA_FOLLOWDASH_PRECEED_DIGITr   r   r   setr   chars
pup_number	pup_punct
pup_symbolr   number_regexpunct_regexsymbol_regexNONASCIIPUNCT_1PUNCT_2SYMBOLSr%   r   r#   r'    r   r   r   r      s       + +\ K((",J!rz(++S0BJ8997BE%2:&899:E$"*%788*D#L11:= 		 RWWSS!3!3H!=!=>>??@@JBGGCC 2 2= A ABBCCDDIRWWSS!3!3H!=!=>>??@@J 26*j*==L"&Z;;K26*j*==L rz*++W4H 	
999+999::G
 	
999999::G
 bj.l...//8G%wA
 
 
4 4 4 4( GL4 4 4 4 4 4r   r   )
r+   ior,   nltk.corpusr   nltk.tokenize.apir   nltk.tokenize.utilr   r   r>   r   r   <module>rC      s     
			 				 $ $ $ $ $ $ ( ( ( ( ( ( + + + + + +Y4 Y4 Y4 Y4 Y4J Y4 Y4 Y4 Y4 Y4r   