"""
Multi-Word Expression Tokenizer

A ``MWETokenizer`` takes a string which has already been divided into tokens and
retokenizes it, merging multi-word expressions into single tokens, using a lexicon
of MWEs:


    >>> from nltk.tokenize import MWETokenizer

    >>> tokenizer = MWETokenizer([('a', 'little'), ('a', 'little', 'bit'), ('a', 'lot')])
    >>> tokenizer.add_mwe(('in', 'spite', 'of'))

    >>> tokenizer.tokenize('Testing testing testing one two three'.split())
    ['Testing', 'testing', 'testing', 'one', 'two', 'three']

    >>> tokenizer.tokenize('This is a test in spite'.split())
    ['This', 'is', 'a', 'test', 'in', 'spite']

    >>> tokenizer.tokenize('In a little or a little bit or a lot in spite of'.split())
    ['In', 'a_little', 'or', 'a_little_bit', 'or', 'a_lot', 'in_spite_of']

    )
TokenizerI)Triec                   &    e Zd ZdZddZd Zd ZdS )MWETokenizerzhA tokenizer that processes tokenized text and merges multi-word expressions
    into single tokens.
    N_c                 D    |sg }t          |          | _        || _        dS )a  Initialize the multi-word tokenizer with a list of expressions and a
        separator

        :type mwes: list(list(str))
        :param mwes: A sequence of multi-word expressions to be merged, where
            each MWE is a sequence of strings.
        :type separator: str
        :param separator: String that should be inserted between words in a multi-word
            expression token. (Default is '_')

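        An editor-added example (not in the original docstring): with no MWEs
        registered, ``tokenize`` returns the input tokens unchanged.

        >>> MWETokenizer().tokenize('no expressions here'.split())
        ['no', 'expressions', 'here']
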
        N)r   _mwes
_separator)selfmwes	separators      M/var/www/html/ai-engine/env/lib/python3.11/site-packages/nltk/tokenize/mwe.py__init__zMWETokenizer.__init__(   s)      	D$ZZ
#    c                 :    | j                             |           dS )a  Add a multi-word expression to the lexicon (stored as a word trie)

        We use ``util.Trie`` to represent the trie. Its form is a dict of dicts.
        The key True marks the end of a valid MWE.

        :param mwe: The multi-word expression we're adding into the word trie
        :type mwe: tuple(str) or list(str)

        :Example:

        >>> tokenizer = MWETokenizer()
        >>> tokenizer.add_mwe(('a', 'b'))
        >>> tokenizer.add_mwe(('a', 'b', 'c'))
        >>> tokenizer.add_mwe(('a', 'x'))
        >>> expected = {'a': {'x': {True: None}, 'b': {True: None, 'c': {True: None}}}}
        >>> tokenizer._mwes == expected
        True

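        An editor-added check (not in the original docstring): re-inserting an
        existing MWE leaves the trie unchanged.

        >>> tokenizer.add_mwe(('a', 'b'))
        >>> tokenizer._mwes == expected
        True
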
        N)r	   insert)r   mwes     r   add_mwezMWETokenizer.add_mwe9   s     ( 	
#r   c                 (   d}t          |          }g }||k     r||         | j        v r|}| j        }d}||k     r=||         |v r3|||                  }|dz   }t          j        |v r|}||k     r
||         |v 3|dk    r|}t          j        |v s|dk    r8|                    | j                            |||                              |}nA|                    ||                    |dz  }n |                    ||                    |dz  }||k     |S )a  

        :param text: A list containing tokenized text
        :type text: list(str)
        :return: A list of the tokenized text with multi-words merged together
        :rtype: list(str)

        :Example:

        >>> tokenizer = MWETokenizer([('hors', "d'oeuvre")], separator='+')
        >>> tokenizer.tokenize("An hors d'oeuvre tonight, sir?".split())
        ['An', "hors+d'oeuvre", 'tonight,', 'sir?']
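
        An editor-added example (illustrative MWEs): when one registered MWE
        extends another, the longest match wins.

        >>> tokenizer = MWETokenizer([('New', 'York'), ('New', 'York', 'City')])
        >>> tokenizer.tokenize('I love New York City'.split())
        ['I', 'love', 'New_York_City']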

        """
        i = 0
        n = len(text)
        result = []

        while i < n:
            if text[i] in self._mwes:
                # possible MWE match: walk the trie as far as the input allows
                j = i
                trie = self._mwes
                last_match = -1
                while j < n and text[j] in trie:
                    trie = trie[text[j]]
                    j = j + 1
                    if Trie.LEAF in trie:
                        # a complete MWE ends at position j; remember it
                        last_match = j
                if last_match > -1:
                    # back up to the end of the longest complete MWE seen
                    j = last_match
                if Trie.LEAF in trie or last_match > -1:
                    # success! merge the matched span into a single token
                    result.append(self._separator.join(text[i:j]))
                    i = j
                else:
                    # no complete MWE starts here, so keep the token as-is
                    result.append(text[i])
                    i += 1
            else:
                result.append(text[i])
                i += 1
        return result
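

# An editor-added usage sketch, not part of the original NLTK module: running
# this file directly exercises ``add_mwe`` and ``tokenize`` together. The MWEs
# below are illustrative only.
if __name__ == "__main__":
    _tokenizer = MWETokenizer([("hors", "d'oeuvre")], separator="+")
    _tokenizer.add_mwe(("black", "tie"))
    # "black tie" and "hors d'oeuvre" are merged; other tokens pass through
    print(_tokenizer.tokenize("A black tie hors d'oeuvre reception".split()))
    # expected: ['A', 'black+tie', "hors+d'oeuvre", 'reception']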