
    Ng!                     ~    d dl Z d dlZd dlZd dlmZmZ ddlmZmZ ddl	m
Z
 ddlmZ ddlmZ  G d d	e          ZdS )
    N)zipxrange   )	ErrorCodeLangDetectException)Language)NGram)unicode_blockc                       e Zd ZdZdZdZdZdZdZdZ	dZ
 ej        d	          Z ej        d
          Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd ZdS )Detectoraj  
    Detector class is to detect language from specified text.
    Its instance is able to be constructed via the factory class DetectorFactory.

    After appending a target text to the Detector instance with .append(string),
    the detector provides the language detection results for target text via .detect() or .get_probabilities().

    .detect() method returns a single language name which has the highest probability.
    .get_probabilities() methods returns a list of multiple languages and their probabilities.

    The detector has some parameters for language detection.
    See set_alpha(double), .set_max_text_length(int) .set_prior_map(dict).

    Example:

        from langdetect.detector_factory import DetectorFactory
        factory = DetectorFactory()
        factory.load_profile('/path/to/profile/directory')

        def detect(text):
            detector = factory.create()
            detector.append(text)
            return detector.detect()

        def detect_langs(text):
            detector = factory.create()
            detector.append(text)
            return detector.get_probabilities()
    g      ?g?i  g?gwJ?'  unknownz'https?://[-_.?&~;+=/#0-9A-Za-z]{1,2076}z>[-_.0-9A-Za-z]{1,64}@[-_0-9A-Za-z]{1,255}[-_.0-9A-Za-z]{1,255}c                     |j         | _         |j        | _        |j        | _        t          j                    | _        d| _        d | _        | j        | _        d| _	        d| _
        d | _        d| _        d S )N    r   F)word_lang_prob_maplanglistseedrandomRandomtextlangprobALPHA_DEFAULTalphan_trialmax_text_length	prior_mapverbose)selffactorys     O/var/www/html/ai-engine/env/lib/python3.11/site-packages/langdetect/detector.py__init__zDetector.__init__8   sf    ")"<(L	moo	'
$    c                     d| _         d S )NT)r   r   s    r!   set_verbosezDetector.set_verboseF   s    r#   c                     || _         d S N)r   )r   r   s     r!   	set_alphazDetector.set_alphaI   s    


r#   c                    dgt          | j                  z  | _        d}t          t          | j                            D ]J}| j        |         }||v r7||         }|dk     rt	          t
          j        d          || j        |<   ||z  }K|dk    rt	          t
          j        d          t          t          | j                            D ]}| j        |xx         |z  cc<   dS )z3Set prior information about language probabilities.        r   z'Prior probability must be non-negative.z/More one of prior probability must be non-zero.N)lenr   r   r   r   r   InitParamError)r   r   sumpilangps         r!   set_prior_mapzDetector.set_prior_mapL   s    T]!3!33DN++,, 	 	A=#Dy  dOq55-i.FHqrrr$%q!	3;;%i&>@qrrrDN++,, 	& 	&AN1%	& 	&r#   c                     || _         dS )zqSpecify max size of target text to use for language detection.
        The default value is 10000(10KB).
        N)r   )r   r   s     r!   set_max_text_lengthzDetector.set_max_text_length]   s      /r#   c                 N   | j                             d|          }| j                            d|          }t          j        |          }d}t          t          t          |          | j                            D ](}||         }|dk    s|dk    r| xj	        |z  c_	        |})dS )zAppend the target text for language detection.
        If the total size of target text exceeds the limit size specified by
        Detector.set_max_text_length(int), the rest is cut down.
         r   N)
URL_REsubMAIL_REr	   normalize_vir   minr,   r   r   )r   r   prer/   chs        r!   appendzDetector.appendc   s    
 {sD))|T**!$''CIIt';<<== 	 	AaBSyyC3JJ		R		CC		 	r#   c                    d\  }}| j         D ]H}d|cxk    rdk    r	n n|dz  }|t          j        d          k    rt          |          dk    r|dz  }I|dz  |k     r&d}| j         D ]}|dk     sd|k     r||z  }|| _         d	S d	S )
zCleaning text to detect
        (eliminate URL, e-mail address and Latin sentence if it is not written in Latin alphabet).
        )r   r   Azr   u   ̀zLatin Extended Additional   r   N)r   sixur
   )r   latin_countnon_latin_countr=   text_without_latins        r!   cleaning_textzDetector.cleaning_textr   s     (,$_) 	% 	%BbCq suX&&=+<+<@[+[+[1$?_,,!#i - -88sRxx&",&*DIII -,r#   c                 V    |                                  }|r|d         j        S | j        S )zsDetect language of the target text and return the language name
        which has the highest probability.
        r   )get_probabilitiesr0   UNKNOWN_LANG)r   probabilitiess     r!   detectzDetector.detect   s4     ..00 	) #((  r#   c                 l    | j         |                                  |                     | j                   S r(   )r   _detect_block_sort_probabilityr%   s    r!   rJ   zDetector.get_probabilities   s2    =    %%dm444r#   c                    |                                   |                                 }|st          t          j        d          dgt          | j                  z  | _        | j        	                    | j	                   t          | j                  D ]R}|                                 }| j        | j                            dd          | j        z  z   }d}	 |                     || j                            |          |           |dz  dk    rY|                     |          | j        k    s|| j        k    rn5| j        r(t-          j        d|                     |                     |dz  }t          t          | j                            D ]%}| j        |xx         ||         | j        z  z  cc<   &| j        r(t-          j        d	|                     |                     Td S )
NzNo features in text.r+         ?r   T   >r   z==>)rH   _extract_ngramsr   r   CantDetectErrorr,   r   r   r   r   r   r   _init_probabilityr   gaussALPHA_WIDTH_update_lang_probchoice_normalize_probCONV_THRESHOLDITERATION_LIMITr   rC   print_rP   )r   ngramstprobr   r/   js          r!   rO   zDetector._detect_block   s   %%'' 	Y%i&?AWXXXDM 2 22###%% 	@ 	@A))++DJ!2!23!<!<t?O!OOEA&&tT[-?-?-G-GOOOq5A::++D11D4GGG1PTPdKdKd| F
3(>(>t(D(DEEEQ C..// ; ;a   DGdl$::    | @
5$"8"8">">???!	@ 	@r#   c                     | j         t          | j                   S dt          | j                  z  gt          | j                  z  S )zzInitialize the map of language probabilities.
        If there is the specified prior map, use it as initial map.
        NrR   )r   listr,   r   r%   s    r!   rW   zDetector._init_probability   sC     >%'''#dm,,,-DM0B0BBBr#   c                 r   t          t          dt          j        dz                       }g }t                      }| j        D ]t}|                    |           |j        r|D ]R}t          |j                  |k     r n7|j        | d         }|r$|dk    r|| j	        v r|
                    |           Su|S )z!Extract n-grams from target text.r   Nr6   )re   r   r	   N_GRAMr   add_charcapitalwordr,   gramsr   r>   )r   RANGEresultngramr=   nws          r!   rU   zDetector._extract_ngrams   s    VAu|a/0011) 
	% 
	%BNN2   % %u{##a''EK$ %ca4+B&B&BMM!$$$r#   c           	      P   |	|| j         vrdS | j         |         }| j        rBt          j        |d|                     |          d|                     |                     || j        z  }t          t          |                    D ]}||xx         |||         z   z  cc<   dS )z:Update language probabilities with N-gram string(N=1,2,3).NF(z): T)	r   r   rC   r_   _unicode_encode_word_prob_to_string	BASE_FREQr   r,   )r   rb   wordr   lang_prob_mapweightr/   s          r!   rZ   zDetector._update_lang_prob   s    <4t'>>>5/5< 	tJtttT-A-A$-G-G-G-GIbIbcpIqIqIqrsss'D		"" 	1 	1AGGGva 000GGGGtr#   c                     d}t          t          |                    D ]%}||         }|dk    r|d| j        |         |fz  z  }&|S )Nr   gh㈵>z %s:%.5f)r   r,   r   )r   rb   rl   rc   r1   s        r!   rs   zDetector._word_prob_to_string   sW    D		"" 	= 	=AQAG||*a(8!'<<<r#   c                     dt          |          }}t          t          |                    D ]}||         |z  }||k     r|}|||<   |S )zRNormalize probabilities and check convergence by the maximun probability.
        r+   )sumr   r,   )r   rb   maxpr.   r/   r1   s         r!   r\   zDetector._normalize_prob   s[     #d))dD		"" 	 	AQ$AaxxDGGr#   c                 v      fdt           j        |          D             }|                    d           |S )Nc                 L    g | ] \  }}|j         k    t          ||          !S  )PROB_THRESHOLDr   ).0r0   r1   r   s      r!   
<listcomp>z.Detector._sort_probability.<locals>.<listcomp>   s6    iii	qQRUYUhQhQh(4##QhQhQhr#   T)reverse)r   r   sort)r   rb   rl   s   `  r!   rP   zDetector._sort_probability   sA    iiiiS5M5MiiiD!!!r#   c                    d}|D ]}|t          j        d          k    rct          dt          |          z             dd          }t	          |          dk     rd|z   }t	          |          dk     |d|dd	         z   z  }}||z  }|S )
Nr      i   rB      0z\ur   rS   )rC   rD   hexordr,   )r   ru   bufr=   sts        r!   rr   zDetector._unicode_encode   s     	 	BSU8__$$3r77*++ABB/"ggkkrB "ggkkur!A#w&r	
r#   N) __name__
__module____qualname____doc__r   rY   r^   r   r]   rt   rK   recompiler7   r9   r"   r&   r)   r2   r4   r>   rH   rM   rJ   rO   rW   rU   rZ   rs   r\   rP   rr   r~   r#   r!   r   r      sl        < MKONNILRZBCCFbjZ[[G      & & &"/ / /  + + +$! ! !5 5 5
@ @ @6C C C  &    	 	 	  

 
 
 
 
r#   r   )r   r   rC   	six.movesr   r   lang_detect_exceptionr   r   languager   utils.ngramr	   utils.unicode_blockr
   objectr   r~   r#   r!   <module>r      s     				 



 ! ! ! ! ! ! ! ! A A A A A A A A             . . . . . .l l l l lv l l l l lr#   