
    NgP                     8   d dl Z d dlmZ d dlmZ d dlmZ d dlm	Z	 d Z G d d          Z
d	 Z	 ddZ e j        d          ZddZd Z	 ddZd Z e j        de j                  Z e j        d          Zd Zg ddfdZd Zedk    r e             dS dS )    N)accuracy)map_tag)	str2tuple)Treec                     g }g }|D ]M}|                      |                                          }|t          |          z  }|t          |          z  }Nt          ||          S )a|  
    Score the accuracy of the chunker against the gold standard.
    Strip the chunk information from the gold standard and rechunk it using
    the chunker, then compute the accuracy score.

    :type chunker: ChunkParserI
    :param chunker: The chunker being evaluated.
    :type gold: tree
    :param gold: The chunk structures to score the chunker on.
    :rtype: float
    )parseflattentree2conlltags	_accuracy)chunkergold	gold_tags	test_tags	gold_tree	test_trees         K/var/www/html/ai-engine/env/lib/python3.11/site-packages/nltk/chunk/util.pyr   r      sr     II / /	MM)"3"3"5"566	^I...	^I...		 Y	***    c                   h    e Zd ZdZd Zd Zd Zd Zd Zd Z	dd	Z
d
 Zd Zd Zd Zd Zd Zd ZdS )
ChunkScorea;  
    A utility class for scoring chunk parsers.  ``ChunkScore`` can
    evaluate a chunk parser's output, based on a number of statistics
    (precision, recall, f-measure, misssed chunks, incorrect chunks).
    It can also combine the scores from the parsing of multiple texts;
    this makes it significantly easier to evaluate a chunk parser that
    operates one sentence at a time.

    Texts are evaluated with the ``score`` method.  The results of
    evaluation can be accessed via a number of accessor methods, such
    as ``precision`` and ``f_measure``.  A typical use of the
    ``ChunkScore`` class is::

        >>> chunkscore = ChunkScore()           # doctest: +SKIP
        >>> for correct in correct_sentences:   # doctest: +SKIP
        ...     guess = chunkparser.parse(correct.leaves())   # doctest: +SKIP
        ...     chunkscore.score(correct, guess)              # doctest: +SKIP
        >>> print('F Measure:', chunkscore.f_measure())       # doctest: +SKIP
        F Measure: 0.823

    :ivar kwargs: Keyword arguments:

        - max_tp_examples: The maximum number actual examples of true
          positives to record.  This affects the ``correct`` member
          function: ``correct`` will not return more than this number
          of true positive examples.  This does *not* affect any of
          the numerical metrics (precision, recall, or f-measure)

        - max_fp_examples: The maximum number actual examples of false
          positives to record.  This affects the ``incorrect`` member
          function and the ``guessed`` member function: ``incorrect``
          will not return more than this number of examples, and
          ``guessed`` will not return more than this number of true
          positive examples.  This does *not* affect any of the
          numerical metrics (precision, recall, or f-measure)

        - max_fn_examples: The maximum number actual examples of false
          negatives to record.  This affects the ``missed`` member
          function and the ``correct`` member function: ``missed``
          will not return more than this number of examples, and
          ``correct`` will not return more than this number of true
          negative examples.  This does *not* affect any of the
          numerical metrics (precision, recall, or f-measure)

        - chunk_label: A regular expression indicating which chunks
          should be compared.  Defaults to ``'.*'`` (i.e., all chunks).

    :type _tp: list(Token)
    :ivar _tp: List of true positives
    :type _fp: list(Token)
    :ivar _fp: List of false positives
    :type _fn: list(Token)
    :ivar _fn: List of false negatives

    :type _tp_num: int
    :ivar _tp_num: Number of true positives
    :type _fp_num: int
    :ivar _fp_num: Number of false positives
    :type _fn_num: int
    :ivar _fn_num: Number of false negatives.
    c                    t                      | _        t                      | _        t                      | _        t                      | _        t                      | _        |                    dd          | _        |                    dd          | _        |                    dd          | _	        |                    dd          | _
        d| _        d| _        d| _        d| _        d| _        d| _        d	| _        d S )
Nmax_tp_examplesd   max_fp_examplesmax_fn_exampleschunk_labelz.*r   g        F)set_correct_guessed_tp_fp_fnget_max_tp_max_fp_max_fn_chunk_label_tp_num_fp_num_fn_num_count_tags_correct_tags_total_measuresNeedUpdate)selfkwargss     r   __init__zChunkScore.__init__r   s    555555zz"3S99zz"3S99zz"3S99"JJ}d;; #(   r   c                 4   | j         r| j        | j        z  | _        | j        | j        z
  | _        | j        | j        z
  | _        t          | j                  | _        t          | j                  | _        t          | j                  | _	        d| _         d S d S )NF)
r-   r   r   r   r!   r    lenr'   r(   r)   r.   s    r   _updateMeasureszChunkScore._updateMeasures   s    # 	-}t}4DH}t}4DH}t}4DHtx==DLtx==DLtx==DL',D$$$	- 	-r   c           	         | xj         t          || j        | j                  z  c_         | xj        t          || j        | j                  z  c_        | xj        dz  c_        d| _        	 t          |          }t          |          }n# t          $ r dx}}Y nw xY w| xj        t          |          z  c_        | xj
        t          d t          ||          D                       z  c_
        dS )aU  
        Given a correctly chunked sentence, score another chunked
        version of the same sentence.

        :type correct: chunk structure
        :param correct: The known-correct ("gold standard") chunked
            sentence.
        :type guessed: chunk structure
        :param guessed: The chunked sentence to be scored.
           T c              3   ,   K   | ]\  }}||k    d V  dS )r6   Nr7   ).0tgs      r   	<genexpr>z#ChunkScore.score.<locals>.<genexpr>   s3       "
 "
1aqAvvAvvvv"
 "
r   N)r   
_chunksetsr*   r&   r   r-   r
   
ValueErrorr,   r2   r+   sumzip)r.   correctguessedcorrect_tagsguessed_tagss        r   scorezChunkScore.score   s    	GT[$:KLLLGT[$:KLLLq#' 	-)'22L)'22LL 	- 	- 	- +-,L<<<		-
 	C---c "
 "
l;;"
 "
 "
 
 
 	
s   +B
 
BBc                 :    | j         dk    rdS | j        | j         z  S )z
        Return the overall tag-based accuracy for all text that have
        been scored by this ``ChunkScore``, using the IOB (conll2000)
        tag encoding.

        :rtype: float
        r   r6   )r,   r+   r3   s    r   r   zChunkScore.accuracy   s(     q  1!D$444r   c                 l    |                                   | j        | j        z   }|dk    rdS | j        |z  S )z
        Return the overall precision for all texts that have been
        scored by this ``ChunkScore``.

        :rtype: float
        r   )r4   r'   r(   r.   divs     r   	precisionzChunkScore.precision   ?     	lT\)!881<#%%r   c                 l    |                                   | j        | j        z   }|dk    rdS | j        |z  S )z
        Return the overall recall for all texts that have been
        scored by this ``ChunkScore``.

        :rtype: float
        r   r4   r'   r)   rH   s     r   recallzChunkScore.recall   rK   r         ?c                     |                                   |                                 }|                                 }|dk    s|dk    rdS d||z  d|z
  |z  z   z  S )a  
        Return the overall F measure for all texts that have been
        scored by this ``ChunkScore``.

        :param alpha: the relative weighting of precision and recall.
            Larger alpha biases the score towards the precision value,
            while smaller alpha biases the score towards the recall
            value.  ``alpha`` should have a value in the range [0,1].
        :type alpha: float
        :rtype: float
        r   r6   )r4   rJ   rN   )r.   alphaprs       r   	f_measurezChunkScore.f_measure   sd     	NNKKMM66Q!VV1EAIUa/00r   c                 j    |                                   t          | j                  }d |D             S )z
        Return the chunks which were included in the
        correct chunk structures, but not in the guessed chunk
        structures, listed in input order.

        :rtype: list of chunks
        c                     g | ]
}|d          S r6   r7   r9   cs     r   
<listcomp>z%ChunkScore.missed.<locals>.<listcomp>       %%%!%%%r   )r4   listr!   r.   chunkss     r   missedzChunkScore.missed   s7     	dh%%f%%%%r   c                 j    |                                   t          | j                  }d |D             S )z
        Return the chunks which were included in the guessed chunk structures,
        but not in the correct chunk structures, listed in input order.

        :rtype: list of chunks
        c                     g | ]
}|d          S rW   r7   rX   s     r   rZ   z(ChunkScore.incorrect.<locals>.<listcomp>   r[   r   )r4   r\   r    r]   s     r   	incorrectzChunkScore.incorrect   s7     	dh%%f%%%%r   c                 B    t          | j                  }d |D             S )z
        Return the chunks which were included in the correct
        chunk structures, listed in input order.

        :rtype: list of chunks
        c                     g | ]
}|d          S rW   r7   rX   s     r   rZ   z&ChunkScore.correct.<locals>.<listcomp>  r[   r   )r\   r   r]   s     r   rA   zChunkScore.correct   '     dm$$%%f%%%%r   c                 B    t          | j                  }d |D             S )z
        Return the chunks which were included in the guessed
        chunk structures, listed in input order.

        :rtype: list of chunks
        c                     g | ]
}|d          S rW   r7   rX   s     r   rZ   z&ChunkScore.guessed.<locals>.<listcomp>  r[   r   )r\   r   r]   s     r   rB   zChunkScore.guessed  re   r   c                 H    |                                   | j        | j        z   S )NrM   r3   s    r   __len__zChunkScore.__len__  s#    |dl**r   c                 F    dt          t          |                     z   dz   S )z`
        Return a concise representation of this ``ChunkScoring``.

        :rtype: str
        z<ChunkScoring of z chunks>)reprr2   r3   s    r   __repr__zChunkScore.__repr__  s      #T#d))__4zAAr   c                     dd|                                  dz  ddz   d|                                 dz  ddz   d|                                 dz  ddz   d|                                 dz  dd	z   S )
a-  
        Return a verbose representation of this ``ChunkScoring``.
        This representation includes the precision, recall, and
        f-measure scores.  For other information about the score,
        use the accessor methods (e.g., ``missed()`` and ``incorrect()``).

        :rtype: str
        zChunkParse score:
z    IOB Accuracy: r   z5.1fz%%
z    Precision:    z    Recall:       z    F-Measure:    z%%)r   rJ   rN   rT   r3   s    r   __str__zChunkScore.__str__  s     "DDMMOOc$9DDDDFEDNN$4$4s$:EEEEG CDKKMMC$7BBBBD DDNN$4$4s$:CCCC	E	
r   N)rO   )__name__
__module____qualname____doc__r0   r4   rE   r   rJ   rN   rT   r_   rb   rA   rB   ri   rl   rn   r7   r   r   r   r   3   s        < <|) ) )&- - -
 
 
:
5 
5 
5& & && & &1 1 1 1&
& 
& 
&	& 	& 	&& & && & &+ + +B B B
 
 
 
 
r   r   c                 T   d}g }| D ]}t          |t                    rwt          j        ||                                          r+|                    ||f|                                f           |t          |                                          z  }|dz  }t          |          S )Nr   r6   )

isinstancer   rematchlabelappendfreezer2   leavesr   )r:   countr   posr^   childs         r   r=   r=   2  s    
CF  eT"" 	xU[[]]33 >s|U\\^^<===3u||~~&&&CC1HCCv;;r   NPS/c                    t          j        d          }t          |g           g}|                    |           D ]Y}|                                }	|	d         dk    ryt          |          dk    r%t          d|                                d          t          |g           }
|d                             |
           |                    |
           |	d         dk    rMt          |          d	k    r%t          d
|                                d          |	                                 ||d                             |	           t          |	|          \  }}|r|rt          |||          }|d                             ||f           [t          |          dk    r t          dt          |           d          |d         S )aB  
    Divide a string of bracketted tagged text into
    chunks and unchunked tokens, and produce a Tree.
    Chunks are marked by square brackets (``[...]``).  Words are
    delimited by whitespace, and each word should have the form
    ``text/tag``.  Words that do not contain a slash are
    assigned a ``tag`` of None.

    :param s: The string to be converted
    :type s: str
    :param chunk_label: The label to use for chunk nodes
    :type chunk_label: str
    :param root_label: The label to use for the root of the tree
    :type root_label: str
    :rtype: Tree
    z\[|\]|[^\[\]\s]+r   [r6   zUnexpected [ at char d]   zUnexpected ] at char NzExpected ] at char )ru   compiler   finditergroupr2   r>   startrx   popr   r   )sr   
root_labelsepsource_tagsettarget_tagsetWORD_OR_BRACKETstackrv   textchunkwordtags                r   tagstr2treer   ?  s   ( j!455O*b!!"E ))!,, . .{{}}7c>>5zzQ !J!J!J!JKKKb))E"IU###LL!W^^5zzQ !J!J!J!JKKKIIKKKK{b	  &&&&%dC00	c  E] E!-DDCb	  $----
5zzQ9s1vv999:::8Or   z(\S+)\s+(\S+)\s+([IOB])-?(\S+)?r~   PPVPc                    t          |g           g}t          |                     d                    D ]!\  }}|                                st                              |          }|t          d|d          |                                \  }}}	}
||
|vrd}	|	dk    o|
|d                                         k    }|	dv s|r't          |          d	k    r|
                                 |	d
k    s|r@t          |
g           }|d                             |           |                    |           |d                             ||f           #|d         S )a*  
    Return a chunk structure for a single sentence
    encoded in the given CONLL 2000 style string.
    This function converts a CoNLL IOB string into a tree.
    It uses the specified chunk types
    (defaults to NP, PP and VP), and creates a tree rooted at a node
    labeled S (by default).

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted.
    :type chunk_types: tuple
    :param root_label: The node label to use for the root.
    :type root_label: str
    :rtype: Tree
    
NzError on line r   OIr   BOr   Br   )r   	enumeratesplitstrip_LINE_RErv   r>   groupsrw   r2   r   rx   )r   chunk_typesr   r   linenolinerv   r   r   state
chunk_type
mismatch_Ir   s                r   conllstr2treer   u  sk   $ *b!!"E!!''$--00 & &zz|| 	 t$$=8f888999).&sE: "z'D'DE c\EjE"IOO4E4E&E
D==J=5zzQ		 C<<:<R((E"IU###LL 	b	$%%%%8Or   c                 N   g }| D ]}	 |                                 }d}|D ]O}t          |t                    rt          d          |                    |d         |d         ||z   f           d}Pl# t
          $ r' |                    |d         |d         df           Y w xY w|S )z
    Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
    Convert a tree to the CoNLL IOB tag format.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: list(tuple)
    B-z7Tree is too deeply nested to be printed in CoNLL formatr   r6   I-r   )rw   rt   r   r>   rx   AttributeError)r:   tagsr}   categoryprefixcontentss         r   r
   r
     s     D 3 3	3{{}}HF!  h-- $Q   Xa[(1+v7HIJJJ  	3 	3 	3KKq58S122222	3Ks   A(A11.B"!B"Fc                 
   t          |g           }| D ]n\  }}}|)|rt          d          |                    ||f           2|                    d          r/|                    t          |dd         ||fg                     v|                    d          rt	          |          dk    sAt          |d         t                     r&|d                                         |dd         k    rA|rt          d          |                    t          |dd         ||fg                      |d                             ||f           ?|dk    r|                    ||f           ^t          d	|          |S )
z1
    Convert the CoNLL IOB format to a tree.
    NzBad conll tag sequencer   r   r   r   r   r   zBad conll tag )r   r>   rx   
startswithr2   rt   rw   )sentencer   r   stricttreer   postagchunktags           r   conlltags2treer     s    
BD"* < <fh , !9::: T6N++++  && 	<KKXabb\T6N+;<<====  && 	<D		Q!$r(D11 8>>##x|33 F$%=>>> KKXabb\T6N3C D DEEEERv////__KKv'''':h::;;;Kr   c                 ^    d t          |           D             }d                    |          S )z
    Return a multiline string where each line contains a word, tag and IOB tag.
    Convert a tree to the CoNLL IOB string format

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: str
    c                 8    g | ]}d                      |          S ) )join)r9   tokens     r   rZ   z!tree2conllstr.<locals>.<listcomp>  s"    <<<SXXe__<<<r   r   )r
   r   )r:   liness     r   tree2conllstrr     s1     =<.*;*;<<<E99Ur   a   <DOC>\s*(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?<BODY>\s*(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?<TEXT>(?P<text>.*?)</TEXT>\s*</BODY>\s*</DOC>\s*z#<b_\w+\s+[^>]*?type="(?P<type>\w+)"c                    t          |g           g}| g S t          j        d|           D ]1}|                                }	 |                    d          rt
                              |          }|t          d|           t          |                    d          g           }|d                             |           |                    |           nE|                    d          r|	                                 n|d                             |           # t          t          f$ r,}t          d|                                dd	          |d }~ww xY wt          |          d
k    rt          d          |d         S )Nz<[^>]+>|[^\s<]+z<b_XXXXtyper   z<e_z$Bad IEER string (error at character r   )r6   zBad IEER stringr   )r   ru   r   r   r   _IEER_TYPE_RErv   printrx   r   
IndexErrorr>   r   r2   )r   r   r   piece_mpiecemr   es           r   _ieer_read_textr     s   *b!!"E 	y	;1155  	&& (!''..9&%(((QWWV__b11b	  '''U####!!%(( (		
 b	  '''J' 	 	 	Kw}}KKKK 	 5zzQ*+++8Os   CDE/'EE)	LOCATIONORGANIZATIONPERSONDURATIONDATECARDINALPERCENTMONEYMEASUREc           	      `   t                               |           }|rt          |                    d          |          |                    d          |                    d          |                    d          t          |                    d          |          dS t          | |          S )ap  
    Return a chunk structure containing the chunked tagged text that is
    encoded in the given IEER style string.
    Convert a string of chunked tagged text in the IEER named
    entity format into a chunk structure.  Chunks are of several
    types, LOCATION, ORGANIZATION, PERSON, DURATION, DATE, CARDINAL,
    PERCENT, MONEY, and MEASURE.

    :rtype: Tree
    r   docnodoctype	date_timeheadline)r   r   r   r   r   )_IEER_DOC_RErv   r   r   )r   r   r   r   s       r   ieerstr2treer   '  s    8 	1A .#AGGFOOZ@@WWW%%wwy))-- (
(;(;ZHH
 
 	
 q*---r   c                  d   d} dd l }|j                            | d          }|                                 t	                       d} t          | d          }|                                 t	          d           t	          |j                            |                     t	                       d S )	Nzd[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./.r   r~   )r   av  
These DT B-NP
research NN I-NP
protocols NNS I-NP
offer VBP B-VP
to TO B-PP
the DT B-NP
patient NN I-NP
not RB O
only RB O
the DT B-NP
very RB I-NP
best JJS I-NP
therapy NN I-NP
which WDT B-NP
we PRP B-NP
have VBP B-VP
established VBN I-VP
today NN B-NP
but CC B-NP
also RB I-NP
the DT B-NP
hope NN I-NP
of IN B-PP
something NN B-NP
still RB B-ADJP
better JJR I-ADJP
. . O
)r~   r   )r   zCoNLL output:)nltkr   r   pprintr   r   r   )r   r   r:   
conll_trees       r   demor   R  s    nAKKK
qd33AHHJJJ	GGG	A< ql;;;J 
/	$*
"
":
.
.///	GGGGGr   __main__)r~   r   r   NN)r   r   )r   r   F)ru   nltk.metricsr   r   nltk.tag.mappingr   nltk.tag.utilr   	nltk.treer   r   r=   r   r   r   r   r
   r   r   DOTALLr   r   r   r   r   ro   r7   r   r   <module>r      s   
			 . . . . . . $ $ $ $ $ $ # # # # # #      + + +<z
 z
 z
 z
 z
 z
 z
 z
~
 
 
 UY. . . .f 2:8992 2 2 2j  8 FK! ! ! !H
 
 
 rz I
 
 
ABB  D
 
 
 (. (. (. (.V, , ,^ zDFFFFF r   