
    Ng[                     :    d Z ddlZddlmZ  G d de          ZdS )a  
The tok-tok tokenizer is a simple, general tokenizer, where the input has one
sentence per line; thus only final period is tokenized.

Tok-tok has been tested on, and gives reasonably good results for English,
Persian, Russian, Czech, French, German, Vietnamese, Tajik, and a few others.
The input should be in UTF-8 encoding.

Reference:
Jon Dehdari. 2014. A Neurophysiologically-Inspired Statistical Language
Model (Doctoral dissertation). Columbus, OH, USA: The Ohio State University.
    N)
TokenizerIc                   P   e Zd ZdZ ej        d          dfZ ej        d          dfZ ej        d          dfZ ej        d          dfZ	 ej        d          d	fZ
 ej        d
          dfZ ej        d          dfZ ej        d          dfZ ej        d          dfZ ej        d          dfZ ej        d          dfZ ej        d          dfZ ej        d          dfZ ej        d          dfZ ej        d          dfZ ej        d          dfZ ed          Z ed          Z ed          Z ej        de d          d fZ ej        de d          d fZ ej        de d          d fZ ej        d!          d"fZ ej        d#          d$fZ ej        d%          d&fZ ej        d'          d&fZ  ej        d(          d)fZ! ej        d*          d+fZ" ej        d,          dfZ#eeeeeee e
eeeeeeeeeee	eeeee#gZ$d0d.Z%d/S )1ToktokTokenizeru  
    This is a Python port of the tok-tok.pl from
    https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl

    >>> toktok = ToktokTokenizer()
    >>> text = u'Is 9.5 or 525,600 my favorite number?'
    >>> print(toktok.tokenize(text, return_str=True))
    Is 9.5 or 525,600 my favorite number ?
    >>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things'
    >>> print(toktok.tokenize(text, return_str=True))
    The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things
    >>> text = u'¡This, is a sentence with weird» symbols… appearing everywhere¿'
    >>> expected = u'¡ This , is a sentence with weird » symbols … appearing everywhere ¿'
    >>> assert toktok.tokenize(text, return_str=True) == expected
    >>> toktok.tokenize(text) == [u'¡', u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'»', u'symbols', u'…', u'appearing', u'everywhere', u'¿']
    True
         u1   ([،;؛¿!"\])}»›”؟¡%٪°±©®।॥…])z \1 u   ([({\[“‘„‚«‹「『])u
   ([–—])z& z&amp; 	z &#9; z\|z &#124; u   (?<!,)([,،])(?![,\d])u	   (['’`])z ` ` z `` z ' ' z '' z
(?<!\.)\.$z .u    (?<!\.)\.\s*(["'’»›”]) *$z . \1z(,{2,})z(-{2,})z(\.{2,})u   ([{༺༼᚛‚„⁅⁽₍〈❨❪❬❮❰❲❴⟅⟦⟨⟪⟬⟮⦃⦅⦇⦉⦋⦍⦏⦑⦓⦕⦗⧘⧚⧼⸢⸤⸦⸨〈《「『【〔〖〘〚〝﴾︗︵︷︹︻︽︿﹁﹃﹇﹙﹛﹝（［｛｟｢u   )]}༻༽᚜⁆⁾₎〉❩❫❭❯❱❳❵⟆⟧⟩⟫⟭⟯⦄⦆⦈⦊⦌⦎⦐⦒⦔⦖⦘⧙⧛⧽⸣⸥⸧⸩〉》」』】〕〗〙〛〞〟﴿︘︶︸︺︼︾﹀﹂﹄﹈﹚﹜﹞）］｝｠｣u   $¢£¤¥֏؋৲৳৻૱௹฿៛₠₡₢₣₤₥₦₧₨₩₪₫€₭₮₯₰₱₲₳₴₵₶₷₸₹₺꠸﷼﹩＄￠￡￥￦z([z])z\1 z:(?!//)z : z\?(?!\S)z ? z(:\/\/)[\S+\.\S+\/\S+][\/]z / z /z^ + z\s+$
z {2,}Fc                     t          |          }| j        D ]\  }}|                    ||          }t          |                                          }|r|n|                                S )N)strTOKTOK_REGEXESsubstripsplit)selftext
return_strregexpsubstitutions        P/var/www/html/ai-engine/env/lib/python3.11/site-packages/nltk/tokenize/toktok.pytokenizezToktokTokenizer.tokenize   sg    4yy$($7 	2 	2 FL::lD11DD 4::<<  !3tttzz||3    N)F)&__name__
__module____qualname____doc__recompileNON_BREAKINGFUNKY_PUNCT_1FUNKY_PUNCT_2EN_EM_DASHES	AMPERCENTTABPIPECOMMA_IN_NUMPROB_SINGLE_QUOTESSTUPID_QUOTES_1STUPID_QUOTES_2FINAL_PERIOD_1FINAL_PERIOD_2MULTI_COMMASMULTI_DASHES
MULTI_DOTSr   
OPEN_PUNCTCLOSE_PUNCTCURRENCY_SYMOPEN_PUNCT_RECLOSE_PUNCT_RECURRENCY_SYM_RE	URL_FOE_1	URL_FOE_2	URL_FOE_3	URL_FOE_4LSTRIPRSTRIP	ONE_SPACEr   r    r   r   r   r      sH        & 2:h'',L BJSTTV]]MBJABBGKM2:l++W4L 
4  (*I
"*T

H
$C2:ej(D 2:788'AL $L117: bj**G3O bj**G3O  RZ..5N  RZ GHH(RN 2:j))72L2:j))72LK(('1J 	/
 
J #	)
 
K 3	5 L BJ2J22233V;MRZ 4[ 4 4 455v=N bj!6l!6!6!677?O 
:&&.I
;''/I
8995@I
5!!6)I RZ#FRZ  $&F
8$$c)I 	1N64 4 4 4 4 4r   r   )r   r   nltk.tokenize.apir   r   r<   r   r   <module>r>      sj     
			 ( ( ( ( ( (W4 W4 W4 W4 W4j W4 W4 W4 W4 W4r   