
    NgF:                     H   d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	 d dl
mZmZ d dlmZmZ d Zd Zd Zd	 Zd
 Zd Zd Zd Zd Zd Zd Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d dZd Zd!dZ eddg          Z eg d          Z d Z!e"dk    r e             dS dS )"    N)treebank)BrillTaggerTrainerRegexpTaggerUnigramTagger)PosWord)Template
error_listc                  "    t                       dS )z
    Run a demo with defaults. See source comments for details,
    or docstrings of any of the more specific demo_* functions.
    Npostag     I/var/www/html/ai-engine/env/lib/python3.11/site-packages/nltk/tbl/demo.pydemor      s    
 HHHHHr   c                  &    t          d           dS )N
    Exemplify repr(Rule) (see also str(Rule) and Rule.format("verbose"))
    repr
ruleformatNr   r   r   r   demo_repr_rule_formatr      s     fr   c                  &    t          d           dS )r   strr   Nr   r   r   r   demo_str_rule_formatr   $   s     er   c                  &    t          d           dS )z*
    Exemplify Rule.format("verbose")
    verboser   Nr   r   r   r   demo_verbose_rule_formatr   +   s     i      r   c                  `    t          t          t          g d                    g           dS )a  
    The feature/s of a template takes a list of positions
    relative to the current word where the feature should be
    looked for, conceptually joined by logical OR. For instance,
    Pos([-1, 1]), given a value V, will hold whenever V is found
    one step to the left and/or one step to the right.

    For contiguous ranges, a 2-arg form giving inclusive end
    points can also be used: Pos(-3, -1) is the same as the arg
    below.
    )	templatesN)r   r	   r   r   r   r   demo_multiposition_featurer$   2   s2     hs<<<00112333333r   c            	      ~    t          t          t          dg          t          ddg                    g           dS )z8
    Templates can have more than a single feature.
    r   r    r!   r"   N)r   r	   r   r   r   r   r   demo_multifeature_templater&   A   s:     htQCyy#r2h--889::::::r   c                  (    t          dd           dS )ah  
    Show aggregate statistics per template. Little used templates are
    candidates for deletion, much used templates may possibly be refined.

    Deleting unused templates is mostly about saving time and/or space:
    training is basically O(T) in the number of templates T
    (also in terms of memory usage, which often will be the limiting factor).
    T)incremental_statstemplate_statsNr   r   r   r   demo_template_statisticsr*   H   s     T$777777r   c                  >   t          j        g dddgd          } t          j        g dddgd          }t          t	          j        | |gd	                    }t          d
                    t          |                               t          |dd           dS )a	  
    Template.expand and Feature.expand are class methods facilitating
    generating large amounts of templates. See their documentation for
    details.

    Note: training with 500 templates can easily fill all available
    even on relatively small corpora
    )r!   r      r,      F)excludezero)r    r!   r   r,   T)r,      )combinationsz8Generated {} templates for transformation-based learning)r#   r(   r)   N)	r   expandr   listr	   printformatlenr   )wordtplstagtplsr#   s      r   demo_generated_templatesr8   T   s     {:::1v5AAAHj!QTBBBGX_h%8vNNNOOI	BII	NN	
 	
  
 Y$tLLLLLLr   c                  *    t          ddd           dS )z
    Plot a learning curve -- the contribution on tagging accuracy of
    the individual rules.
    Note: requires matplotlib
    Tzlearningcurve.png)r(   separate_baseline_datalearning_curve_outputNr   r   r   r   demo_learning_curver<   h   s.     #1     r   c                  &    t          d           dS )zW
    Writes a file with context for each erroneous word after tagging testing data
    z
errors.txt)error_outputNr   r   r   r   demo_error_analysisr?   u   s     %%%%%%r   c                  &    t          d           dS )zm
    Serializes the learned tagger to a file in pickle format; reloads it
    and validates the process.
    z
tagger.pcl)serialize_outputNr   r   r   r   demo_serialize_taggerrB   |   s    
 L))))))r   c                  *    t          ddd           dS )z
    Discard rules with low accuracy. This may hurt performance a bit,
    but will often produce rules which are more interesting read to a human.
    i  gQ?
   )	num_sentsmin_acc	min_scoreNr   r   r   r   demo_high_accuracy_rulesrH      s    
 T42666666r     ,  r/   皙?Fr   c           	      	   |pt           }| ddlm}m}  |            } t	          |||||          \  }}}}|rt
          j                            |          spt          ||          }t          |d          5 }t          j        ||           ddd           n# 1 swxY w Y   t          d                    |                     t          |          5 }t          j        |          }t          d|            ddd           n# 1 swxY w Y   n t          ||          }t          d           |r5t          d	                    |                    |                               t!          j                    }t#          || ||	
          }t          d           |                    ||||          }t          dt!          j                    |z
  dd           |r%t          d|                    |          z             |dk    r`t          d           t'          |                                d          D ].\  }}t          |dd|                    |	          d           /|
rt          d           |                    ||          \  } }!t          d           |st          d           |                                }"|r|                    |!           |r%t1          ||!|"|           t          d|            n:t          d           |                    |          } |r|                                 |t          |d          5 }#|#                    d|z             |#                    d                    t9          ||                                         d          dz              ddd           n# 1 swxY w Y   t          d|            ||                    |          } t          |d          5 }t          j        ||           ddd           n# 1 swxY w Y   t          d|            t          |          5 }t          j        |          }$ddd           n# 1 swxY w Y   t          d|            |                    |          }%| |%k    rt          d            dS t          d!           dS dS )"a
  
    Brill Tagger Demonstration
    :param templates: how many sentences of training and testing data to use
    :type templates: list of Template

    :param tagged_data: maximum number of rule instances to create
    :type tagged_data: C{int}

    :param num_sents: how many sentences of training and testing data to use
    :type num_sents: C{int}

    :param max_rules: maximum number of rule instances to create
    :type max_rules: C{int}

    :param min_score: the minimum score for a rule in order for it to be considered
    :type min_score: C{int}

    :param min_acc: the minimum score for a rule in order for it to be considered
    :type min_acc: C{float}

    :param train: the fraction of the the corpus to be used for training (1=all)
    :type train: C{float}

    :param trace: the level of diagnostic tracing output to produce (0-4)
    :type trace: C{int}

    :param randomize: whether the training data should be a random subset of the corpus
    :type randomize: C{bool}

    :param ruleformat: rule output format, one of "str", "repr", "verbose"
    :type ruleformat: C{str}

    :param incremental_stats: if true, will tag incrementally and collect stats for each rule (rather slow)
    :type incremental_stats: C{bool}

    :param template_stats: if true, will print per-template statistics collected in training and (optionally) testing
    :type template_stats: C{bool}

    :param error_output: the file where errors will be saved
    :type error_output: C{string}

    :param serialize_output: the file where the learned tbl tagger will be saved
    :type serialize_output: C{string}

    :param learning_curve_output: filename of plot of learning curve(s) (train and also test, if available)
    :type learning_curve_output: C{string}

    :param learning_curve_take: how many rules plotted
    :type learning_curve_take: C{int}

    :param baseline_backoff_tagger: the file where rules will be saved
    :type baseline_backoff_tagger: tagger

    :param separate_baseline_data: use a fraction of the training data exclusively for training baseline
    :type separate_baseline_data: C{bool}

    :param cache_baseline_tagger: cache baseline tagger to this file (only interesting as a temporary workaround to get
                                  deterministic output from the baseline unigram tagger between python versions)
    :type cache_baseline_tagger: C{string}


    Note on separate_baseline_data: if True, reuse training data both for baseline and rule learner. This
    is fast and fine for a demo, but is likely to generalize worse on unseen data.
    Also cannot be sensibly used for learning curves on training data (the baseline will be artificially high).
    Nr   )brill24describe_template_sets)backoffwz)Trained baseline tagger, pickled it to {}zReloaded pickled tagger from zTrained baseline taggerz!    Accuracy on test set: {:0.4f}r   zTraining tbl tagger...zTrained tbl tagger in z0.2fz secondsz    Accuracy on test set: %.4fr,   z
Learned rules: 4d szJIncrementally tagging the test data, collecting individual rule statisticsz    Rule statistics collectedzbWARNING: train_stats asked for separate_baseline_data=True; the baseline will be artificially high)takez Wrote plot of learning curve to zTagging the test datazErrors for Brill Tagger %r


zutf-8z)Wrote tagger errors including context to zWrote pickled tagger to z4Reloaded tagger tried on test set, results identicalz;PROBLEM: Reloaded tagger gave different results on test set)REGEXP_TAGGERnltk.tag.brillrM   rN   _demo_prepare_dataospathexistsr   openpickledumpr3   r4   loadaccuracytimer   train	enumeraterulesbatch_tag_incrementaltrain_statsprint_template_statistics
_demo_plot	tag_sentswritejoinr
   encode)&r#   tagged_datarE   	max_rulesrG   rF   rb   trace	randomizer   r(   r)   r>   rA   r;   learning_curve_takebaseline_backoff_taggerr:   cache_baseline_taggerrM   rN   training_databaseline_data	gold_datatesting_databaseline_taggerprint_rulestbrilltrainerbrill_taggerrulenorule
taggedtest	teststats
trainstatsfbrill_tagger_reloadedtaggedtest_reloadeds&                                         r   r   r      s   p 6FBBBBBBBB
 GII	>PUIy2H? ?;]M9l  )w~~344 
	+'>  O +S11 :[O[999: : : : : : : : : : : : : : :;BB)   
 '(( 	KK$k+66OI2GIIJJJ	K 	K 	K 	K 	K 	K 	K 	K 	K 	K 	K 	K 	K 	K 	K (?VWWW'((( 
/66((33 	
 	
 	
 Y[[F Ej  G 

"###==	9gNNL	
F49;;#7
F
F
F
FGGG S.1F1Fy1Q1QQRRR zz!"""%l&8&8&:&:A>> 	> 	>LFDV<<<Z!8!8<<<====  5X	
 	
 	
 #/"D"D)#
 #
Y 	-...% 	,   "--//
 	>229===  	N%y*CV    L5JLLMMM%&&&!++L99
 	522444 ,$$ 	YGG47GGHHHGGDIIjJ??@@GGPPSWWXXX	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	Y 	H,HHIII #!++L99
"C(( 	3KKk222	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3 	3;)9;;<<<"## 	={$*K$<$<!	= 	= 	= 	= 	= 	= 	= 	= 	= 	= 	= 	= 	= 	= 	=@.>@@AAA*44\BB,,,HIIIIIOPPPPP $#s[   8BB!B'D		DDA%OOOP..P25P2Q;;Q?Q?c                    | "t          d           t          j                    } |t          |           |k    rt          |           }|r5t	          j        t          |                      t	          j        |            t          ||z            }| d |         }| ||         }d |D             }|s|}	n&t          |          dz  }
|d |
         ||
d          }}	t          |          \  }}t          |          \  }}t          |	          \  }}t          d|dd|dd           t          d|dd|dd           t          d		                    |||rd
nd                     ||	||fS )Nz%Loading tagged data from treebank... c                 &    g | ]}d  |D             S )c                     g | ]
}|d          S )r   r   ).0ts     r   
<listcomp>z1_demo_prepare_data.<locals>.<listcomp>.<listcomp>a  s    (((aQqT(((r   r   )r   sents     r   r   z&_demo_prepare_data.<locals>.<listcomp>a  s'    ???T((4(((???r   r/   zRead testing data (dz sents/z wds)zRead training data (z-Read baseline data ({:d} sents/{:d} wds) {:s} z[reused the training set])
r3   r   tagged_sentsr5   randomseedshuffleintcorpus_sizer4   )rm   rb   rE   rp   r:   cutoffrt   rv   rw   ru   	bl_cutoff	trainseqstraintokenstestseqs
testtokensbltrainseqsbltraintokenss                    r   rX   rX   Q  s   
 5666+--C,,	99$$	 $C$$%%%{###U"##F(MF9,-I??Y???L! 
%&&!+	*9*%)**% &  +=99Y(66Xz#.}#=#= [-	
F
F
F
F:
F
F
F
FGGG	
I
I
I
I[
I
I
I
IJJJ	7>>(IBB.I	
 	
   =)\BBr   c                    d         g}d         D ] }|                     |d         |z
             !fd|d |         D             }d         g}d         D ] }|                     |d         |z
             !fd|d |         D             }dd lm} t          t	          t          |                              }|                    ||||           |                    g d           |                    |            d S )Ninitialerrors
rulescoresr!   c                 ,    g | ]}d |d         z  z
  S r,   
tokencountr   )r   xr   s     r   r   z_demo_plot.<locals>.<listcomp>}  s(    KKKQQ<000KKKr   c                 ,    g | ]}d |d         z  z
  S r   r   )r   r   r   s     r   r   z_demo_plot.<locals>.<listcomp>  s(    NNNq!a*\222NNNr   r   )NNNg      ?)	appendmatplotlib.pyplotpyplotr2   ranger5   plotaxissavefig)	r;   r   r   rT   	testcurve	rulescore
traincurvepltrs	    ``      r   rh   rh   y  sA   ?+,I|, 4 4	223333KKKK)ETE:JKKKI_-.J- 6 6	*R.945555NNNNJuu<MNNNJ######U3y>>""##AHHQ	1j)))HH$$$%%%KK%&&&&&r   z^-?[0-9]+(\.[0-9]+)?$CDz.*NN)	r   )z(The|the|A|a|An|an)$AT)z.*able$JJ)z.*ness$r   )z.*ly$RB)z.*s$NNS)z.*ing$VBG)z.*ed$VBDr   c                 R    t          |           t          d | D                       fS )Nc              3   4   K   | ]}t          |          V  d S )N)r5   )r   r   s     r   	<genexpr>zcorpus_size.<locals>.<genexpr>  s(      00a3q66000000r   )r5   sum)seqss    r   r   r     s*    IIs0040000011r   __main__)NNrI   rJ   r/   NrK   r/   Fr   FFNNNrJ   NFN)NN)#rY   r]   r   ra   nltk.corpusr   nltk.tagr   r   r   rW   r   r   nltk.tblr	   r
   r   r   r   r   r$   r&   r*   r8   r<   r?   rB   rH   r   rX   rh   NN_CD_TAGGERrV   r   __name__r   r   r   <module>r      s1   
			                D D D D D D D D D D $ $ $ $ $ $ $ $ ) ) ) ) ) ) ) )      ! ! !4 4 4; ; ;	8 	8 	8M M M(
 
 
& & &* * *7 7 7 

  'BQ BQ BQ BQJ%C %C %CP' ' ' '& |=}MNN
 
 
 2 2 2 z r   