
    Ng                         d Z ddlmZ ddlmZ 	 ddlZn# e$ r dZY nw xY w G d d          Zd Z	e
dk    r e	             dS dS )	a  
A module for language identification using the TextCat algorithm.
An implementation of the text categorization algorithm
presented in Cavnar, W. B. and J. M. Trenkle,
"N-Gram-Based Text Categorization".

The algorithm takes advantage of Zipf's law and uses
n-gram frequencies to profile languages and text-yet to
be identified-then compares using a distance measure.

Language n-grams are provided by the "An Crubadan"
project. A corpus reader was created separately to read
those files.

For details regarding the algorithm, see:
https://www.let.rug.nl/~vannoord/TextCat/textcat.pdf

For details about An Crubadan, see:
https://borel.slu.edu/crubadan/index.html
    )maxsize)trigramsNc                   F    e Zd ZdZi ZdZdZi Zd Zd Z	d Z
d Zd Zd	 ZdS )
TextCatN<>c                     t           st          d          ddlm} || _        | j                                        D ]}| j                            |           d S )Nzclassify.textcat requires the regex module that supports unicode. Try '$ pip install regex' and see https://pypi.python.org/pypi/regex for further details.r   )crubadan)reOSErrornltk.corpusr
   _corpuslangs	lang_freq)selfr
   langs      Q/var/www/html/ai-engine/env/lib/python3.11/site-packages/nltk/classify/textcat.py__init__zTextCat.__init__7   s|     	#   	)(((((L&&(( 	) 	)DL""4((((	) 	)    c                 .    t          j        dd|          S )z)Get rid of punctuation except apostrophesz[^\P{P}\']+ )r   subr   texts     r   remove_punctuationzTextCat.remove_punctuationG   s    vnb$///r   c                    ddl m}m} |                     |          } ||          } |            }|D ]L}t	          | j        |z   | j        z             }d |D             }	|	D ]}
|
|v r||
xx         dz  cc<   d||
<   M|S )z'Create FreqDist of trigrams within textr   )FreqDistword_tokenizec                 8    g | ]}d                      |          S )r   )join).0tris     r   
<listcomp>z#TextCat.profile.<locals>.<listcomp>U   s"    KKKsbggcllKKKr      )nltkr   r   r   r   _START_CHAR	_END_CHAR)r   r   r   r   
clean_texttokensfingerprintttoken_trigram_tuplestoken_trigramscur_trigrams              r   profilezTextCat.profileK   s    00000000,,T22
z**hjj 	1 	1A#+D,<q,@4>,Q#R#R KK6JKKKN- 1 1+--,,,1,,,,/0K,,	1 r   c                 J   | j                             |          }d}||v r{t          |                                                              |          }t          |                                                              |          }t          ||z
            }nt          }|S )zgCalculate the "out-of-place" measure between the
        text and language profile for a single trigramr   )r   r   listkeysindexabsr   )r   r   trigramtext_profilelang_fddistidx_lang_profileidx_texts           r   	calc_distzTextCat.calc_dist_   s     ,((..g#GLLNN3399'BBL--//0066w??H '(233DD
 Dr   c                     i }|                      |          }| j        j                                        D ](}d}|D ]}||                     |||          z  }|||<   )|S )zOCalculate the "out-of-place" measure between
        the text and all languagesr   )r/   r   _all_lang_freqr2   r;   )r   r   	distancesr/   r   	lang_distr5   s          r   
lang_distszTextCat.lang_distst   s     	,,t$$L/4466 	( 	(D I" D DT^^D'7CCC		'IdOOr   c                 v    |                      |          | _        t          | j        | j        j                  S )zYFind the language with the min distance
        to the text and return its ISO 639-3 code)key)r@   last_distancesmingetr   s     r   guess_languagezTextCat.guess_language   s4     #ood334&D,?,CDDDDr   )__name__
__module____qualname__r   fingerprintsr&   r'   rC   r   r   r/   r;   r@   rF    r   r   r   r   /   s        GLKIN) ) ) 0 0 0  (  *  $E E E E Er   r   c                  N  
 ddl m}  g d}dddddd	d
ddd	}t                      }|D ]}|                     |          t	                    dz
  }t          t          t                              }d}t          d|          D ]@
dd                    
fdt          d|
                   D                       z   }||z  }At          d|dd         z   dz              |
                    |          }	t          d|	 d||	          d           t          d           d S )Nr   )udhr)	zKurdish-UTF8zAbkhaz-UTF8zFarsi_Persian-UTF8z
Hindi-UTF8zHawaiian-UTF8zRussian-UTF8zVietnamese-UTF8zSerbian_Srpski-UTF8zEsperanto-UTF8zNorthern Kurdish	AbkhazianzIranian PersianHindiHawaiianRussian
VietnameseSerbian	Esperanto)	kmrabkpeshinhawrusviesrpepor$   r    c                 ,    g | ]}         |         S rK   rK   )r!   jiraw_sentencess     r   r#   zdemo.<locals>.<listcomp>   s#    &V&V&Vq}Q'7':&V&V&Vr   zLanguage snippet:    z...zLanguage detection: z ()z############################################################################################################################################)r   rM   r   sentslenr1   mapranger    printrF   )rM   r   friendlytccur_langrowscolssamplecur_sentguessra   rb   s             @@r   demorr      s         
 
 
E " 
 
H 
B  

8,,=!!A%C]++,, q$ 	 	ASXX&V&V&V&V&VE!TRSWDUDU&V&V&VWWWHhFF 	"VAcE]2U:;;;!!&))@U@@huo@@@AAAi# r   __main__)__doc__sysr   	nltk.utilr   regexr   ImportErrorr   rr   rG   rK   r   r   <module>ry      s    *               	BBB\E \E \E \E \E \E \E \E@. . .b zDFFFFF s    