
    Ng~0                     X   d Z ddlZddlZddlmZ ddlmZmZ 	 ddl	m
Z
 n# e$ r Y nw xY wddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ  G d de          Z G d de          Zd Zd Zd ZddZd Zd Z G d de          ZddZ	 e dk    r ed            ed           dS dS )z
Named entity chunker
    N)ElementTree)ClassifierBasedTaggerpos_tag)MaxentClassifier)ChunkParserI)
ChunkScorefind)word_tokenize)Treec                   ,    e Zd ZdZddZd Zd Zd ZdS )NEChunkParserTaggerz2
    The IOB tagger used by the chunk parser.
    Nc                 @    t          j        | || j        |           d S )N)trainclassifier_builder
classifier)r   __init___classifier_builder)selfr   r   s      S/var/www/html/ai-engine/env/lib/python3.11/site-packages/nltk/chunk/named_entity.pyr   zNEChunkParserTagger.__init__$   s5    &#7!		
 	
 	
 	
 	
 	
    c                 2    t          j        |ddd          S )Niis      )	algorithmgaussian_prior_sigmatrace)r   r   r   r   s     r   r   z'NEChunkParserTagger._classifier_builder,   s(    %!"
 
 
 	
r   c                     	 | j         }nD# t          $ r7 ddlm} t	          |                    d                    | _         | j         }Y nw xY w|S )Nr   )wordszen-basic)_en_wordlistAttributeErrornltk.corpusr!   set)r   wlr!   s      r   _english_wordlistz%NEChunkParserTagger._english_wordlist5   sm    	#"BB 	# 	# 	#)))))) #EKK
$;$; < <D"BBB		#
 	s   
 >A
Ac                    ||         d         }t          ||         d                   }|dk    rd x}}d x}}	d x}
x}}n|dk    r[||dz
           d                                         }d }t          ||dz
           d                   }d }	||dz
           d         }d x}
}n||dz
           d                                         }||dz
           d                                         }t          ||dz
           d                   }t          ||dz
           d                   }	||dz
           }||dz
           }t          |          }
|t          |          dz
  k    r	d x}}d x}}n|t          |          dz
  k    rK||dz            d                                         }||dz            d                                         }d }d }n||dz            d                                         }||dz            d                                         }||dz            d                                         }||dz            d                                         }i dddt          |          dt          |          d|d d	                                         d
|dd                                          d|d|d||                                 v d|d|d|d|d|d|                                 d| d| d| d|
 d| }|S )Nr   r   r   biasTshapewordlenprefix3   suffix3poswordzen-wordlistprevtagprevposnextposprevwordnextwordzword+nextpos+zpos+prevtagzshape+prevtag)simplify_poslowerr*   lenr'   )r   tokensindexhistoryr1   r0   r5   prevprevwordr3   prevprevpos	prevshaper2   prevprevtagr6   nextnextwordr4   nextnextposfeaturess                     r   _feature_detectorz%NEChunkParserTagger._feature_detector?   s   e}Q6%=+,,A::&**H|$((Gk044I4++aZZeai(+1133HL"6%!)#4Q#788GKeai(+G&**Ieai(+1133H!%!),Q/5577L"6%!)#4Q#788G&veai'8';<<Keai(G!%!),KhICKK!O##&**H|$((Gkkc&kkAo%%eai(+1133HUQY'*0022GLKKeai(+1133HUQY'*0022G!%!),Q/5577L +A.4466K
D
U4[[
 s4yy
 tBQBx~~''	

 tBCCy((
 3
 D
 DD$:$:$<$<<
 w
 w
 w
 
 
 tzz||77g77
 c--G--
  	55G55!
& r   )NN)__name__
__module____qualname____doc__r   r   r'   rE    r   r   r   r      s_         
 
 
 

 
 
  8 8 8 8 8r   r   c                   @    e Zd ZdZd Zd Zd Zd Zed             Z	dS )NEChunkParser2
    Expected input: list of pos-tagged words
    c                 0    |                      |           d S N)_trainr   s     r   r   zNEChunkParser.__init__   s    Er   c                 d    | j                             |          }|                     |          }|S )z8
        Each token should be a pos-tagged word
        )_taggertag_tagged_to_parse)r   r;   taggedtrees       r   parsezNEChunkParser.parse   s1     !!&))$$V,,r   c                 N      fd|D             }t          |           _        d S )Nc                 :    g | ]}                     |          S rJ   )_parse_to_tagged).0sr   s     r   
<listcomp>z(NEChunkParser._train.<locals>.<listcomp>   s'    ;;;q$''**;;;r   )r   )r   rR   )r   corpuss   ` r   rP   zNEChunkParser._train   s2    ;;;;F;;;*888r   c                 6   t          dg           }|D ]\  }}|dk    r|                    |           "|                    d          r-|                    t          |dd         |g                     d|                    d          r|r]t          |d         t                     rB|d                                         |dd         k    r|d                             |           |                    t          |dd         |g                     |S )zH
        Convert a list of tagged tokens to a chunk-parse tree.
        SOB-r   NI-)r   append
startswith
isinstancelabel)r   tagged_tokenssenttokrS   s        r   rT   zNEChunkParser._tagged_to_parse   s    C}}% 		6 		6HCczzC    %% 6DQRR3%001111%% 6 6JtBx66 648>>;K;KsSTSUSUw;V;VHOOC((((KKSWse 4 4555r   c                    g }| D ]}t          |t                    rt          |          dk    rt          d           :|                    |d         d|                                 f           |dd         D ].}|                    |d|                                 f           /|                    |df           |S )zH
        Convert a chunk-parse tree to a list of tagged tokens.
        r   z"Warning -- empty chunk in sentencerb   r   Nrc   ra   )rg   r   r:   printre   rh   )rj   tokschildrk   s       r   rZ   zNEChunkParser._parse_to_tagged   s    
  		* 		*E%&& *u::??>???U1X';EKKMM';';<=== 9 = =CKK&:5;;==&:&: ;<<<<= UCL))))r   N)
rF   rG   rH   rI   r   rW   rP   rT   staticmethodrZ   rJ   r   r   rL   rL   z   su             9 9 9  $   \  r   rL   c                 *   t          j        d| t           j                  rdS t          j        d| t           j                  rdS t          j        d| t           j                  r.|                                 rdS |                                 rdS dS d	S )
Nz![0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$numberz\W+$punctz\w+$upcasedowncase	mixedcaseother)rematchUNICODEistitleislower)r1   s    r   r*   r*      s    	x4dBJGG x	'4	,	, 
w	'4	,	, <<>> 	8\\^^ 	:;wr   c                 f    |                      d          rdS |                     d          d         S )NV-r   )rf   split)r\   s    r   r8   r8      s/    ||C swws||Ar   c                    |                                  }d t          |          D             }t          dg           }| D ]}t          |t                    re|                    t          |                                g                      |D ],}|d                             |t          |          f           -||                    |t          |          f           |S )Nc              3       K   | ]	\  }}|V  
d S rO   rJ   )r[   r1   r0   s      r   	<genexpr>zpostag_tree.<locals>.<genexpr>   s&      66s666666r   r`   rd   )leavesr   r   rg   re   rh   next)rV   r!   tag_iternewtreero   subchilds         r   postag_treer      s    KKMME66wu~~666H3mmG 4 4eT"" 	4NN4r22333! ? ?""Hd8nn#=>>>>? NNE4>>23333Nr   binaryTc           	   #     K   | D ]}t          j        |          D ]l\  }}}|                    d          r|r|D ]K}|                    d          r4t          t           j                            ||          |          E d {V  Lmd S )Nbnewsz.sgm)oswalkendswithload_ace_filepathjoin)rootsfmt
skip_bnewsrootdirsfilesfs          r   load_ace_datar      s       I I!# 	I 	ID$}}W%% *  I I::f%% I,RW\\$-B-BCHHHHHHHHHI	II Ir   c           	   #     K   t          dt          j                            |           d                     | dz   }g }t	          |          5 }t          j        |                                          }d d d            n# 1 swxY w Y   |                    d          D ]}|	                    d          j
        }|                    d          D ]}|                    d          dk    rt          |	                    d	          j
                  }	t          |	                    d
          j
                  dz   }
|                    |	|
|f           t	          |           5 }|                                }d d d            n# 1 swxY w Y   t          j        dd|          }d }t          j        d||          }t          j        dd|          }t          j        dd|          }t          j        dd|          }d |D             }|dk    rd}t#          dg           }t%          |          D ]~\  }	}
}|	|k     r|}	|
|	k    r|                    t)          |||	                              |                    t#          d||	|
                                                              |
}|                    t)          ||d                               |V  d S |dk    rd}t#          dg           }t%          |          D ]~\  }	}
}|	|k     r|}	|
|	k    r|                    t)          |||	                              |                    t#          |||	|
                                                              |
}|                    t)          ||d                               |V  d S t+          d          )Nz  - r   z.tmx.rdc.xmlzdocument/entityentity_typeentity_mentionTYPENAMEzhead/charseq/startzhead/charseq/endz<(?!/?TEXT)[^>]+> c                 `    d|                                  |                                 z
  dz
  z  S )N    )endstart)ms    r   subfunczload_ace_file.<locals>.subfunc   s'    aeegg		)A-..r   z[\s\S]*<TEXT>z</TEXT>[\s\S]*z``z "z''z" c                     h | ]\  }}}|	S rJ   rJ   )r[   r\   etyps       r   	<setcomp>z load_ace_file.<locals>.<setcomp>  s    444KQ3C444r   r   r   r`   NE
multiclasszbad fmt value)rm   r   r   r   openETrW   getrootfindallr
   textgetintre   readrx   subr   sortedextendr   
ValueError)textfiler   annfileentitiesinfilexmlentityr   mentionr\   r   r   r   entity_typesirn   s                   r   r   r      s<     	
-x((+
-
-...'G H	g )&hv&&(() ) ) ) ) ) ) ) ) ) ) ) ) ) )++/00 ) )kk-((-~~&677 	) 	)G{{6""f,,GLL!566;<<AGLL!3449::Q>AOOQ3K((((	) 
h 6{{}}               6%r400D/ / / 6"GT22D6#R..D 6$d##D6$d##D448444L hC}})) 	 	IAq#1uuAvvKKd1Q3i00111KKT4!9??#4#455666AAM$qrr(++,,,




 
		C}})) 	 	IAq#1uuAvvKKd1Q3i00111KKS$qs)//"3"344555AAM$qrr(++,,,




 )))s$   'BBB(F		FFc           	         t                               |           } t                               |          }d}t          | |          D ]z\  \  }}\  }}||cxk    rdk    rFn nC|s@t          d|dd|dd|            t          d                    ddd                     d}^d}t          d|dd|dd|            {d S )	NFra   z  15r   z  {:15} {:15} {2}z...T)rL   rZ   ziprm   format)correctguessedellipsiswctgts         r   
cmp_chunksr   .  s   ,,W55G,,W55GH11 , ,B!R????s?????  .2...2...1..///)00uEEFFFH*r***r***q**++++, ,r   c                   &    e Zd ZdZddZd Zd ZdS )Maxent_NE_ChunkerrM   r   c                 p    ddl m} || _         |d| d          | _        |                                  d S )Nr   r	   z+chunkers/maxent_ne_chunker_tab/english_ace_/)	nltk.datar
   _fmt_tab_dirload_params)r   r   r
   s      r   r   zMaxent_NE_Chunker.__init__E  sP    """"""	Q3QQQRRr   c                     ddl m}m}  || j                  \  }}}}t	           ||||          |          }t          |          | _        d S )Nr   )BinaryMaxentFeatureEncodingload_maxent_params)alwayson_features)r   )nltk.classify.maxentr   r   r   r   r   rR   )r   r   r   wgtmpglabaonmcs           r   r   zMaxent_NE_Chunker.load_paramsL  ss    XXXXXXXX//>>S#s''SCHHH#
 
 +b999r   c           	          ddl m} | j        j        }|j        }|j        }|j        }|j        }|j        }| j	        } |||||d| d           d S )Nr   )save_maxent_paramsz/tmp/english_ace_r   )tab_dir)
r   r   rR   _classifier	_encoding_weights_mapping_labels	_alwaysonr   )	r   r   classifecgr   r   r   r   r   s	            r   save_paramszMaxent_NE_Chunker.save_paramsU  sx    ;;;;;;,*lkmi3S#7Q37Q7Q7QRRRRRRr   Nr   )rF   rG   rH   rI   r   r   r   rJ   r   r   r   r   @  sU            : : :
S 
S 
S 
S 
Sr   r   r   c                 L    t          |           }|                                 |S rO   )r   r   )r   chunkers     r   build_modelr   b  s&    $$GNr   __main__)r   Tr   )!rI   r   rx   	xml.etreer   r   nltk.tagr   r   nltk.classifyr   ImportErrornltk.chunk.apir   nltk.chunk.utilr   r   r
   nltk.tokenizer   	nltk.treer   r   rL   r*   r8   r   r   r   r   r   r   rF   rJ   r   r   <module>r      s1    
			 				 ' ' ' ' ' ' 3 3 3 3 3 3 3 3	....... 	 	 	D	 ( ' ' ' ' ' & & & & & &       ' ' ' ' ' '      X X X X X/ X X Xv8 8 8 8 8L 8 8 8v       I I I ID* D* D*R, , ,$S S S S S S S SD   (T zKK s   ! ))