
    Ng=              
       x   d Z ddlZddlmZ ddlZddlmZ dZdZdZ	dZ
eed	d
dddde	df
Zed         e
gedd         R Z ej        d          Z ej        eej        ej        z  ej        z            Z ej        d          Z ej        d          Zd dZd!dZ G d de          Zd Zd Z	 	 	 	 d"dZdS )#a  
Twitter-aware tokenizer, designed to be flexible and easy to adapt to new
domains and tasks. The basic logic is this:

1. The tuple REGEXPS defines a list of regular expression
   strings.

2. The REGEXPS strings are put, in order, into a compiled
   regular expression object called WORD_RE, under the TweetTokenizer
   class.

3. The tokenization is done by WORD_RE.findall(s), where s is the
   user-supplied string, inside the tokenize() method of the class
   TweetTokenizer.

4. When instantiating Tokenizer objects, there are several options:
    * preserve_case. By default, it is set to True. If it is set to
      False, then the tokenizer will downcase everything except for
      emoticons.
    * reduce_len. By default, it is set to False. It specifies whether
      to replace repeated character sequences of length 3 or greater
      with sequences of length 3.
    * strip_handles. By default, it is set to False. It specifies
      whether to remove Twitter handles of text used in the
      `tokenize` method.
    * match_phone_numbers. By default, it is set to True. It indicates
      whether the `tokenize` method should look for phone numbers.
    N)List)
TokenizerIac  
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
      |
      </?3                       # heart
    )u  			# Capture 1: entire matched URL
  (?:
  https?:				# URL protocol and colon
    (?:
      /{1,3}				# 1-3 slashes
      |					#   or
      [a-z0-9%]				# Single letter or digit or '%'
                                       # (Trying not to match e.g. "URI::Escape")
    )
    |					#   or
                                       # looks like domain name followed by a slash:
    [a-z0-9.\-]+[.]
    (?:[a-z]{2,13})
    /
  )
  (?:					# One or more:
    [^\s()<>{}\[\]]+			# Run of non-space, non-()<>{}[]
    |					#   or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)				# balanced parens, non-recursive: (...)
  )+
  (?:					# End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)				# balanced parens, non-recursive: (...)
    |					#   or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’]	# not a space or one of these punct chars
  )
  |					# OR, the following to match naked domains:
  (?:
  	(?<!@)			        # not preceded by a @, avoid matching foo@_gmail.com_
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    [.]
    (?:[a-z]{2,13})
    \b
    /?
    (?!@)			        # not succeeded by a @,
                            # avoid matching "foo.na" in "foo.na@example.com"
  )
u  
  (?:
    [\U0001F1E6-\U0001F1FF]{2}  # all enclosed letter pairs
    |
    # English flag
    \U0001F3F4\U000E0067\U000E0062\U000E0065\U000E006e\U000E0067\U000E007F
    |
    # Scottish flag
    \U0001F3F4\U000E0067\U000E0062\U000E0073\U000E0063\U000E0074\U000E007F
    |
    # For Wales? Why Richard, it profit a man nothing to give his soul for the whole world … but for Wales!
    \U0001F3F4\U000E0067\U000E0062\U000E0077\U000E006C\U000E0073\U000E007F
  )
a	  
    (?:
      (?:            # (international)
        \+?[01]
        [ *\-.\)]*
      )?
      (?:            # (area code)
        [\(]?
        \d{3}
        [ *\-.\)]*
      )?
      \d{3}          # exchange
      [ *\-.\)]*
      \d{4}          # base
    )z	<[^>\s]+>z[\-]+>|<[\-]+z(?:@[\w_]+)z(?:\#+[\w_]+[\w\'_\-]*[\w_]+)z#[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]uR   .(?:
        [🏻-🏿]?(?:‍.[🏻-🏿]?)+
        |
        [🏻-🏿]
    )a  
    (?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace.
       z([^a-zA-Z0-9])\1{3,}z&(#?(x?))([^&;\s]+);zZ(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){15}(?!@)|([A-Za-z0-9_]){1,14}(?![A-Za-z0-9_]*@))strictc                 d    |d}t          | t                    r|                     ||          S | S )Nutf-8)
isinstancebytesdecode)textencodingerrorss      P/var/www/html/ai-engine/env/lib/python3.11/site-packages/nltk/tokenize/casual.py_str_to_unicoder      s8    $ -{{8V,,,K     Tr   c                 d    fd}t                               |t          | |                    S )u  
    Remove entities from text by converting them to their
    corresponding unicode character.

    :param text: a unicode string or a byte string encoded in the given
    `encoding` (which defaults to 'utf-8').

    :param list keep:  list of entity names which should not be replaced.    This supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``)
    and named entities (such as ``&nbsp;`` or ``&gt;``).

    :param bool remove_illegal: If `True`, entities that can't be converted are    removed. Otherwise, entities that can't be converted are kept "as
    is".

    :returns: A unicode string with the entities removed.

    See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py

        >>> from nltk.tokenize.casual import _replace_html_entities
        >>> _replace_html_entities(b'Price: &pound;100')
        'Price: \xa3100'
        >>> print(_replace_html_entities(b'Price: &pound;100'))
        Price: £100
        >>>
    c                 P   |                      d          }|                      d          r}	 |                      d          rt          |d          }nt          |d          }d|cxk    rdk    r&n n#t          |f                              d          S nO# t          $ r d }Y nAw xY w|v r|                      d	          S t
          j        j                            |          }|'	 t          |          S # t          t          f$ r Y nw xY wrd
n|                      d	          S )N   r         
         cp1252r    )groupintr
   r   
ValueErrorhtmlentitiesname2codepointgetchrOverflowError)matchentity_bodynumberkeepremove_illegals      r   _convert_entityz/_replace_html_entities.<locals>._convert_entity  sO   kk!nn;;q>> 	C;;q>> 2 b11FF b11F
 6))))T))))) &++228<<<    d""{{1~~%]155kBBF6{{".    $7rrQ7s$   A(B B&%B&)C8 8DD)ENT_REsubr   )r   r)   r*   r   r+   s    ``  r   _replace_html_entitiesr.      sB    88 8 8 8 8 88 ::otX'F'FGGGr   c                   v    e Zd ZdZdZdZ	 	 	 	 ddZdedee         fdZ	e
dd
            Ze
dd            ZdS )TweetTokenizera  
    Tokenizer for tweets.

        >>> from nltk.tokenize import TweetTokenizer
        >>> tknzr = TweetTokenizer()
        >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
        >>> tknzr.tokenize(s0) # doctest: +NORMALIZE_WHITESPACE
        ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->',
         '<--']

    Examples using `strip_handles` and `reduce_len parameters`:

        >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
        >>> s1 = '@remy: This is waaaaayyyy too much for you!!!!!!'
        >>> tknzr.tokenize(s1)
        [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
    NTFc                 >    || _         || _        || _        || _        dS )ae  
        Create a `TweetTokenizer` instance with settings for use in the `tokenize` method.

        :param preserve_case: Flag indicating whether to preserve the casing (capitalisation)
            of text used in the `tokenize` method. Defaults to True.
        :type preserve_case: bool
        :param reduce_len: Flag indicating whether to replace repeated character sequences
            of length 3 or greater with sequences of length 3. Defaults to False.
        :type reduce_len: bool
        :param strip_handles: Flag indicating whether to remove Twitter handles of text used
            in the `tokenize` method. Defaults to False.
        :type strip_handles: bool
        :param match_phone_numbers: Flag indicating whether the `tokenize` method should look
            for phone numbers. Defaults to True.
        :type match_phone_numbers: bool
        Npreserve_case
reduce_lenstrip_handlesmatch_phone_numbers)selfr3   r4   r5   r6   s        r   __init__zTweetTokenizer.__init__L  s)    . +$*#6   r   r   returnc                 t   t          |          }| j        rt          |          }| j        rt	          |          }t
                              d|          }| j        r| j        	                    |          }n| j
        	                    |          }| j        st          t          d |                    }|S )zTokenize the input text.

        :param text: str
        :rtype: list(str)
        :return: a tokenized list of strings; joining this list returns        the original string if `preserve_case=False`.
        \1\1\1c                 b    t                               |           r| n|                                 S )N)EMOTICON_REsearchlower)xs    r   <lambda>z)TweetTokenizer.tokenize.<locals>.<lambda>  s%    K$6$6q$9$9Hqqqwwyy r   )r.   r5   remove_handlesr4   reduce_lengtheningHANG_REr-   r6   PHONE_WORD_REfindallWORD_REr3   listmap)r7   r   	safe_textwordss       r   tokenizezTweetTokenizer.tokenizeh  s     &d++ 	(!$''D? 	,%d++DKK	400	# 	4&..y99EEL((33E! 	HH5QQ E r   regex.Patternc                     t          |           j        sgt          j        dd                    t
                     dt          j        t          j        z  t          j        z            t          |           _        t          |           j        S )zCore TweetTokenizer regex(|))	type_WORD_REregexcompilejoinREGEXPSVERBOSEIUNICODEr7   s    r   rG   zTweetTokenizer.WORD_RE  sm     Dzz" 	"'-(CHHW%%((('%-7# #DJJ Dzz""r   c                     t          |           j        sgt          j        dd                    t
                     dt          j        t          j        z  t          j        z            t          |           _        t          |           j        S )z#Secondary core TweetTokenizer regexrO   rP   rQ   )	rR   _PHONE_WORD_RErT   rU   rV   REGEXPS_PHONErX   rY   rZ   r[   s    r   rE   zTweetTokenizer.PHONE_WORD_RE  sm     Dzz( 	(-.CHH]++...'%-7) )DJJ% Dzz((r   TFFT)r9   rM   )__name__
__module____qualname____doc__rS   r]   r8   strr   rL   propertyrG   rE   r   r   r   r0   r0   2  s         ( HN  7 7 7 78S T#Y    < # # # X# ) ) ) X) ) )r   r0   c                 V    t          j        d          }|                    d|           S )ze
    Replace repeated character sequences of length 3 or greater with sequences
    of length 3.
    z	(.)\1{2,}r;   )rT   rU   r-   )r   patterns     r   rC   rC     s'    
 mL))G;;y$'''r   c                 8    t                               d|           S )z4
    Remove Twitter username handles from text.
     )
HANDLES_REr-   )r   s    r   rB   rB     s    
 >>#t$$$r   Fc                 N    t          ||||                              |           S )z:
    Convenience function for wrapping the tokenizer.
    r2   )r0   rL   )r   r3   r4   r5   r6   s        r   casual_tokenizerl     s4     ##/	  
 htnnr   )Nr   )r   Tr   r_   )rc   r    typingr   rT   nltk.tokenize.apir   	EMOTICONSURLSFLAGSPHONE_REGEXrW   r^   rU   rD   rX   rY   rZ   r=   r,   rj   r   r.   r0   rC   rB   rl   r   r   r   <module>rs      s   @         ( ( ( ( ( ($		$)f	 	$ 	(.	 

/"J [77122;77 %-/
0
0 emIu}uw'>'NOO 
.	/	/ U]H 
   8H 8H 8H 8H|h) h) h) h) h)Z h) h) h)`( ( (% % %      r   